def test_filename(self):
    """Verify that an appropriate file name is generated in an appropriate
    folder"""
    nested = os.path.join('some', 'depth', 'here')
    # Join the nested folders onto the temp dir. Plain string concatenation
    # drops the separator and yields a sibling path (e.g.
    # "/tmp/tmpXYZsome/depth/here") that lives outside the temp dir.
    file_path = os.path.join(tempfile.mkdtemp(), nested)
    cp = Checkpointer(file_path)
    cp.counter = 25
    filename = cp._filename('A WeIrD TaG')
    # The generated name should sit inside the requested folder, carry the
    # counter, and contain the tag lower-cased with whitespace removed
    self.assertIn(nested, filename)
    self.assertIn('25', filename)
    self.assertIn('aweirdtag', filename)
def build_by_notice(filename, title, act_title, act_section,
                    notice_doc_numbers, doc_number=None, checkpoint=None):
    """Build a regulation from `filename` using an explicit list of notices.

    :param filename: path of the regulation text; read as UTF-8.
    :param title: CFR title; handed to the Builder as ``cfr_title`` and used
        in the checkpoint suffix.
    :param act_title: paired with `act_section` and passed on to layer
        generation.
    :param act_section: see `act_title`.
    :param notice_doc_numbers: iterable of notice document numbers; each one
        is passed to ``Builder.build_notice_from_doc_number``.
    :param doc_number: document number of this version. When ``None`` it is
        derived with ``Builder.determine_doc_number``.
    :param checkpoint: argument for ``Checkpointer``; when falsy a
        ``NullCheckpointer`` is used instead.
    :raises ValueError: if no document number was given and none could be
        determined.

    NOTE: whether diffs are generated is still read from the module-level
    ``args.generate_diffs``.
    """
    with codecs.open(filename, 'r', 'utf-8') as f:
        reg = f.read()
        file_digest = hashlib.sha256(reg.encode('utf-8')).hexdigest()

    if checkpoint:
        checkpointer = Checkpointer(checkpoint)
    else:
        checkpointer = NullCheckpointer()

    # build the initial tree
    reg_tree = checkpointer.checkpoint(
        "init-tree-" + file_digest,
        lambda: Builder.reg_tree(reg))
    title_part = reg_tree.label_id()

    if doc_number is None:
        doc_number = Builder.determine_doc_number(reg, title, title_part)
        # Mirror parse_regulation: fail with a clear message instead of a
        # TypeError from the join below
        if not doc_number:
            raise ValueError("Could not determine document number")

    # Use the `title` argument rather than the global `args.title`, so the
    # suffix always describes the regulation actually being built
    checkpointer.suffix = ":".join(
        ["", title_part, str(title), doc_number])

    # create the builder
    builder = Builder(cfr_title=title,
                      cfr_part=title_part,
                      doc_number=doc_number,
                      checkpointer=checkpointer)

    builder.fetch_notices_json()

    for notice in notice_doc_numbers:
        builder.build_notice_from_doc_number(notice)

    builder.write_regulation(reg_tree)
    layer_cache = LayerCacheAggregator()

    act_title_and_section = [act_title, act_section]

    builder.gen_and_write_layers(reg_tree, act_title_and_section, layer_cache)
    layer_cache.replace_using(reg_tree)

    if args.generate_diffs:
        generate_diffs(reg_tree, act_title_and_section, builder, layer_cache)
def test_dont_load_later_elements(self):
    """Once a checkpoint has actually been executed, no later checkpoint
    may be loaded from storage. That lets a user delete, say, step 5 and
    effectively rebuild everything from that step onwards."""
    checkpointer = Checkpointer(tempfile.mkdtemp())

    # First pass: every step is computed and its value returned
    for step in (1, 2, 3):
        computed = checkpointer.checkpoint(
            str(step), lambda value=step: value)
        self.assertEqual(computed, step)

    checkpointer._reset()

    # Step 1 returns the stored value; the new function's -1 is ignored
    reloaded = checkpointer.checkpoint("1", lambda: -1)
    self.assertEqual(reloaded, 1)
    # Step 2 is forced, so it is executed again ...
    forced = checkpointer.checkpoint("2", lambda: -2, force=True)
    self.assertEqual(forced, -2)
    # ... which means step 3 must be executed as well
    following = checkpointer.checkpoint("3", lambda: -3)
    self.assertEqual(following, -3)
def build_by_notice(filename, title, act_title, act_section,
                    notice_doc_numbers, doc_number=None, checkpoint=None):
    """Build one regulation version from a file and a list of notices.

    The regulation text in `filename` (UTF-8) is parsed into a tree, the
    notices named in `notice_doc_numbers` are built one by one, and the
    regulation and its layers are written through a ``Builder``.

    :param filename: path of the regulation text.
    :param title: CFR title; given to the Builder as ``cfr_title`` and
        embedded in the checkpoint suffix.
    :param act_title: combined with `act_section` for layer generation.
    :param act_section: see `act_title`.
    :param notice_doc_numbers: iterable of notice document numbers.
    :param doc_number: version's document number; looked up with
        ``Builder.determine_doc_number`` when ``None``.
    :param checkpoint: passed to ``Checkpointer`` when truthy; otherwise a
        ``NullCheckpointer`` is used.
    :raises ValueError: when the document number is neither supplied nor
        determinable.

    NOTE: diff generation is still switched by the module-level
    ``args.generate_diffs``.
    """
    with codecs.open(filename, 'r', 'utf-8') as f:
        reg = f.read()
        file_digest = hashlib.sha256(reg.encode('utf-8')).hexdigest()

    if checkpoint:
        checkpointer = Checkpointer(checkpoint)
    else:
        checkpointer = NullCheckpointer()

    # build the initial tree
    reg_tree = checkpointer.checkpoint(
        "init-tree-" + file_digest,
        lambda: Builder.reg_tree(reg))
    title_part = reg_tree.label_id()

    if doc_number is None:
        doc_number = Builder.determine_doc_number(reg, title, title_part)
        if not doc_number:
            # Same guard as parse_regulation; without it the join below
            # would fail with an unhelpful TypeError
            raise ValueError("Could not determine document number")

    # The suffix is built from the `title` parameter (not the global
    # `args.title`) so it matches the title handed to the Builder below
    checkpointer.suffix = ":".join(
        ["", title_part, str(title), doc_number])

    # create the builder
    builder = Builder(cfr_title=title,
                      cfr_part=title_part,
                      doc_number=doc_number,
                      checkpointer=checkpointer)

    builder.fetch_notices_json()

    for notice in notice_doc_numbers:
        builder.build_notice_from_doc_number(notice)

    builder.write_regulation(reg_tree)
    layer_cache = LayerCacheAggregator()

    act_title_and_section = [act_title, act_section]

    builder.gen_and_write_layers(reg_tree, act_title_and_section, layer_cache)
    layer_cache.replace_using(reg_tree)

    if args.generate_diffs:
        generate_diffs(reg_tree, act_title_and_section, builder, layer_cache)
def parse_regulation(args):
    """Run the parser for the given command-line arguments.

    Lives in its own function so that the whole run can be profiled.
    Raises ValueError when no document number can be determined.
    """
    with codecs.open(args.filename, 'r', 'utf-8') as reg_file:
        reg_text = reg_file.read()
        digest = hashlib.sha256(reg_text.encode('utf-8')).hexdigest()
    act_info = [args.act_title, args.act_section]

    checkpointer = (Checkpointer(args.checkpoint) if args.checkpoint
                    else NullCheckpointer())

    # Step one: the regulation tree
    def build_tree():
        return Builder.reg_tree(reg_text)

    reg_tree = checkpointer.checkpoint("init-tree-" + digest, build_tree)
    cfr_part = reg_tree.label_id()

    # Step two: the document number of this version
    def find_doc_number():
        return Builder.determine_doc_number(reg_text, args.title, cfr_part)

    doc_number = checkpointer.checkpoint(
        "doc-number-" + digest, find_doc_number)
    if not doc_number:
        raise ValueError("Could not determine document number")

    suffix_parts = ["", cfr_part, str(args.title), doc_number]
    checkpointer.suffix = ":".join(suffix_parts)

    # Hand everything over to the Builder
    builder = Builder(
        cfr_title=args.title,
        cfr_part=cfr_part,
        doc_number=doc_number,
        checkpointer=checkpointer,
    )
    builder.write_notices()

    # The first regulation version is always written
    logger.info("Version %s", doc_number)
    builder.write_regulation(reg_tree)

    layer_cache = LayerCacheAggregator()
    builder.gen_and_write_layers(reg_tree, act_info, layer_cache)
    layer_cache.replace_using(reg_tree)

    if args.generate_diffs:
        generate_diffs(doc_number, reg_tree, act_info, builder,
                       layer_cache, checkpointer)
def test_exception_reading(self):
    """A checkpoint file that exists but is not in the correct format
    should make deserialization fail gracefully (no explosion)"""
    checkpointer = Checkpointer(tempfile.mkdtemp())
    first_result = checkpointer.checkpoint("1", lambda: 1)
    self.assertEqual(1, first_result)

    # Overwrite the stored checkpoint with an empty file
    stored_path = checkpointer._filename("1")
    with open(stored_path, "w") as clobbered:
        clobbered.write("")

    checkpointer._reset()
    # pickle will raise an exception, so the value is recomputed
    second_result = checkpointer.checkpoint("1", lambda: -1)
    self.assertEqual(-1, second_result)
def test_tree_serialization(self):
    """Trees carry embedded XML, which does not serialize well; make sure
    a tree survives a save/load round trip"""
    inner = Node(text="inner", label=["111", "1"],
                 source_xml=etree.fromstring("""<tag>Hi</tag>"""))
    tree = Node(text="top", label=["111"], title="Reg 111",
                children=[inner])

    checkpointer = Checkpointer(tempfile.mkdtemp())
    # First call saves the tree
    checkpointer.checkpoint("a-tag", lambda: tree)
    checkpointer._reset()
    # No function is supplied, so this would explode if nothing was loaded
    loaded = checkpointer.checkpoint("a-tag", None)

    self.assertEqual(repr(tree), repr(loaded))
    original_xml = etree.tostring(tree.children[0].source_xml)
    loaded_xml = etree.tostring(loaded.children[0].source_xml)
    self.assertEqual(original_xml, loaded_xml)
def test_tree_serialization(self):
    """A tree with embedded XML (which doesn't serialize well) must come
    back intact after being checkpointed and reloaded"""
    tag = "a-tag"

    def child_xml(node):
        # Serialized XML of the node's first child, for comparison
        return etree.tostring(node.children[0].source_xml)

    xml_child = Node(
        text="inner",
        label=["111", "1"],
        source_xml=etree.fromstring("""<tag>Hi</tag>"""),
    )
    tree = Node(
        text="top",
        label=["111"],
        title="Reg 111",
        children=[xml_child],
    )

    cp = Checkpointer(tempfile.mkdtemp())
    cp.checkpoint(tag, lambda: tree)    # saving
    cp._reset()
    loaded = cp.checkpoint(tag, None)   # would explode if not loaded

    self.assertEqual(repr(tree), repr(loaded))
    self.assertEqual(child_xml(tree), child_xml(loaded))
def test_basic_serialization(self):
    """An object can be stored and retrieved. Mutating the original after
    each save and comparing proves the stored copy lives outside of local
    memory."""
    tag = "a-tag"
    payload = {"some": "value", 123: 456}
    checkpointer = Checkpointer(tempfile.mkdtemp())

    # Save under counter 1, then change the in-memory object
    checkpointer.counter = 1
    checkpointer._serialize(tag, payload)
    payload["some"] = "other"
    restored = checkpointer._deserialize(tag)
    self.assertEqual(restored, {"some": "value", 123: 456})
    self.assertEqual(payload, {"some": "other", 123: 456})

    # Save the changed object under counter 2, then change it again
    checkpointer.counter = 2
    checkpointer._serialize(tag, payload)
    payload["some"] = "more"
    restored = checkpointer._deserialize(tag)
    self.assertEqual(restored, {"some": "other", 123: 456})
    self.assertEqual(payload, {"some": "more", 123: 456})

    # Going back to counter 1 yields the first saved value again
    checkpointer.counter = 1
    restored = checkpointer._deserialize(tag)
    self.assertEqual(restored, {"some": "value", 123: 456})
def test_dirs_created(self):
    """If the full path does not exist, it is created"""
    # Build the nested path *inside* the temp dir. Concatenating the two
    # strings drops the separator and points at a sibling directory (e.g.
    # "/tmp/tmpXYZsome/depth/here") outside the temp dir.
    file_path = os.path.join(tempfile.mkdtemp(), 'some', 'depth', 'here')
    # Constructing the Checkpointer is what should create the directories
    Checkpointer(file_path)
    self.assertTrue(os.path.isdir(file_path))
# Remaining command-line arguments (`parser` is created before this point).
parser.add_argument('act_title', type=int, help='Act title', action='store')
parser.add_argument('act_section', type=int, help='Act section')
# NOTE(review): argparse applies `type` to the raw string, and bool() of any
# non-empty string is True, so "--generate-diffs False" still yields True.
# Confirm whether that is intended.
parser.add_argument('--generate-diffs', type=bool, help='Generate diffs?', required=False, default=True)
parser.add_argument('--checkpoint', required=False, help='Directory to save checkpoint data')
args = parser.parse_args()

# Read the regulation text and hash it; the digest is part of the
# checkpoint tags used below.
with codecs.open(args.filename, 'r', 'utf-8') as f:
    reg = f.read()
    file_digest = hashlib.sha256(reg.encode('utf-8')).hexdigest()

act_title_and_section = [args.act_title, args.act_section]

# Use a real Checkpointer only when a checkpoint directory was given
if args.checkpoint:
    checkpointer = Checkpointer(args.checkpoint)
else:
    checkpointer = NullCheckpointer()

# First, the regulation tree
reg_tree = checkpointer.checkpoint(
    "init-tree-" + file_digest,
    lambda: Builder.reg_tree(reg))
title_part = reg_tree.label_id()

# Then the document number, which is also checkpointed
doc_number = checkpointer.checkpoint(
    "doc-number-" + file_digest,
    lambda: Builder.determine_doc_number(reg, args.title, title_part))
if not doc_number:
    raise ValueError("Could not determine document number")

# Suffix is ":<part>:<title>:<doc number>" (leading "" gives the first colon)
checkpointer.suffix = ":".join(
    ["", title_part, str(args.title), doc_number])