def generate_diffs(reg_tree, act_title_and_section, builder, layer_cache):
    """Write every version of the regulation, its layers, and all diffs.

    Broken out into a separate function to assist with profiling, so it is
    easier to determine which parts of the parser take the most time.

    :param reg_tree: tree for the initial version; frozen and stored under
        ``builder.doc_number`` and used as the seed for
        ``builder.revision_generator``.
    :param act_title_and_section: passed through unchanged to
        ``builder.gen_and_write_layers``.
    :param builder: supplies ``doc_number``, ``checkpointer``, ``writer``,
        ``revision_generator``, ``write_regulation`` and
        ``gen_and_write_layers``. Its ``doc_number`` attribute is overwritten
        with each generated version's document number.
    :param layer_cache: invalidated by each notice and then refreshed with
        the corresponding new tree.
    """
    doc_number, checkpointer = builder.doc_number, builder.checkpointer
    all_versions = {doc_number: FrozenNode.from_node(reg_tree)}

    for last_notice, old, new_tree, notices in builder.revision_generator(
            reg_tree):
        version = last_notice['document_number']
        logger.info("Version %s", version)
        all_versions[version] = FrozenNode.from_node(new_tree)
        builder.doc_number = version
        builder.write_regulation(new_tree)
        layer_cache.invalidate_by_notice(last_notice)
        builder.gen_and_write_layers(new_tree, act_title_and_section,
                                     layer_cache, notices)
        layer_cache.replace_using(new_tree)
        # Drop this iteration's references inside the loop so nothing is
        # deleted when the generator yields no revisions.
        del last_notice, old, new_tree, notices  # free some memory

    # Keep only what the diff phase needs before releasing the big objects.
    label_id = reg_tree.label_id()
    writer = builder.writer
    del reg_tree, layer_cache, builder  # free some memory

    # now build diffs - include "empty" diffs comparing a version to itself
    # ``items()`` (rather than the Python-2-only ``iteritems()``) keeps this
    # loop working on both Python 2 and Python 3.
    for lhs_version, lhs_tree in all_versions.items():
        for rhs_version, rhs_tree in all_versions.items():
            # NOTE(review): the lambda closes over the loop variables; this
            # is only correct if checkpoint() invokes it before the loop
            # advances - confirm against the checkpointer implementation.
            changes = checkpointer.checkpoint(
                "-".join(["diff", lhs_version, rhs_version]),
                lambda: dict(changes_between(lhs_tree, rhs_tree)))
            writer.diff(label_id, lhs_version, rhs_version).write(changes)
def write_notice(self, doc_number, old_tree=None, reg_tree=None,
                 layers=None, last_version=''):
    """Write out the notice whose document number is ``doc_number``.

    For the XMLWriter, the reg_tree for the notice has to be included, so
    it is forwarded to the writer. When both ``old_tree`` and ``reg_tree``
    are given, the changes between them are written with the notice;
    otherwise an empty change set is written.
    """
    # Find the first notice with a matching document number (None if absent)
    notice = None
    for candidate in self.notices:
        if candidate['document_number'] == doc_number:
            notice = candidate
            break

    # We can optionally write out the diffs with the notice if we're given
    # both trees.
    if old_tree is None or reg_tree is None:
        changes = {}
    else:
        # FrozenNode and Node are not API-compatible, so both trees are
        # frozen before they are compared.
        frozen_old = FrozenNode.from_node(old_tree)
        frozen_new = FrozenNode.from_node(reg_tree)
        changes = dict(changes_between(frozen_old, frozen_new))

    # Write the notice
    notice_writer = self.writer.notice(
        self.cfr_part, self.doc_number, notices=self.notices, layers=layers)
    notice_writer.write(notice, changes=changes, reg_tree=reg_tree,
                        left_doc_number=last_version)
def generate_diffs(reg_tree, act_title_and_section, builder, layer_cache):
    """Generate all versions, layers and diffs for the given regulation.

    Kept as its own function to assist with profiling, so it is easier to
    determine which parts of the parser take the most time.

    :param reg_tree: initial regulation tree; its frozen form is stored
        under ``builder.doc_number`` and it seeds
        ``builder.revision_generator``.
    :param act_title_and_section: forwarded as-is to
        ``builder.gen_and_write_layers``.
    :param builder: provides ``doc_number``, ``checkpointer``, ``writer``,
        ``revision_generator``, ``write_regulation`` and
        ``gen_and_write_layers``; ``builder.doc_number`` is reassigned to
        every version that is produced.
    :param layer_cache: invalidated per notice, then replaced using the
        new tree for that version.
    """
    doc_number, checkpointer = builder.doc_number, builder.checkpointer
    all_versions = {doc_number: FrozenNode.from_node(reg_tree)}

    revisions = builder.revision_generator(reg_tree)
    for last_notice, old, new_tree, notices in revisions:
        version = last_notice['document_number']
        logger.info("Version %s", version)
        all_versions[version] = FrozenNode.from_node(new_tree)
        builder.doc_number = version
        builder.write_regulation(new_tree)
        layer_cache.invalidate_by_notice(last_notice)
        builder.gen_and_write_layers(new_tree, act_title_and_section,
                                     layer_cache, notices)
        layer_cache.replace_using(new_tree)
        # Released per iteration; doing it here also avoids touching
        # unbound names when there are no revisions at all.
        del last_notice, old, new_tree, notices  # free some memory

    # Grab the two values still needed, then let go of the large inputs.
    label_id = reg_tree.label_id()
    writer = builder.writer
    del reg_tree, layer_cache, builder  # free some memory

    # now build diffs - include "empty" diffs comparing a version to itself
    # dict.iteritems() only exists on Python 2; items() is portable.
    for lhs_version, lhs_tree in all_versions.items():
        for rhs_version, rhs_tree in all_versions.items():
            # NOTE(review): assumes checkpoint() calls the lambda right
            # away, while lhs_tree/rhs_tree still refer to this pair -
            # confirm.
            changes = checkpointer.checkpoint(
                "-".join(["diff", lhs_version, rhs_version]),
                lambda: dict(changes_between(lhs_tree, rhs_tree)))
            writer.diff(
                label_id, lhs_version, rhs_version
            ).write(changes)
def write_notice(self, doc_number, old_tree=None, reg_tree=None,
                 layers=None, last_version=''):
    """Write a single notice, optionally together with its diff.

    The reg_tree is passed on to the writer because the XMLWriter needs it
    for the notice. A diff is only computed when both ``old_tree`` and
    ``reg_tree`` are supplied; otherwise the writer receives ``{}``.
    """
    # Get the notice by doc number; stays None when nothing matches
    matching = None
    for entry_ in self.notices:
        if entry_['document_number'] == doc_number:
            matching = entry_
            break

    have_both_trees = old_tree is not None and reg_tree is not None
    if have_both_trees:
        # FrozenNode and Node are not API-compatible, hence the conversion
        # of both sides before diffing.
        before = FrozenNode.from_node(old_tree)
        after = FrozenNode.from_node(reg_tree)
        changes = dict(changes_between(before, after))
    else:
        changes = {}

    # Write the notice
    out = self.writer.notice(self.cfr_part,
                             self.doc_number,
                             notices=self.notices,
                             layers=layers)
    out.write(matching,
              changes=changes,
              reg_tree=reg_tree,
              left_doc_number=last_version)
def test_title_disappears(self):
    """Removing a node's title is reported as deleting the whole title."""
    titled = FrozenNode("Text", title="Some Title", label=['1111'])
    untitled = FrozenNode("Text", title=None, label=['1111'])

    changes = dict(difftree.changes_between(titled, untitled))

    # "Some Title" is ten characters long, hence the (0, 10) delete span
    expected = {'title': [('delete', 0, 10)], 'op': 'modified'}
    self.assertEqual(changes['1111'], expected)
def test_title_disappears(self):
    """A title going from a string to None shows up as a title delete."""
    before = FrozenNode("Text", title="Some Title", label=['1111'])
    after = FrozenNode("Text", title=None, label=['1111'])
    node_changes = dict(difftree.changes_between(before, after))['1111']
    self.assertEqual(
        node_changes,
        {'op': 'modified', 'title': [('delete', 0, 10)]})
def generate_diff(left_xml, right_xml):
    """Return the dictionary of changes between two full RegML trees.

    Both inputs are built into reg trees, frozen, and compared in the style
    of regulations-parser. This wraps regulations-parser's
    changes_between() function.
    """
    old_tree = build_reg_tree(left_xml)
    new_tree = build_reg_tree(right_xml)

    frozen_old = FrozenNode.from_node(old_tree)
    frozen_new = FrozenNode.from_node(new_tree)
    return dict(changes_between(frozen_old, frozen_new))
def generate_diff(left_xml, right_xml):
    """Diff two full RegML trees, regulations-parser style.

    Thin wrapper around regulations-parser's changes_between(): each XML
    input is turned into a reg tree, converted to a FrozenNode, and the
    resulting changes are returned as a dictionary.
    """
    lhs_tree = build_reg_tree(left_xml)
    rhs_tree = build_reg_tree(right_xml)
    change_pairs = changes_between(
        FrozenNode.from_node(lhs_tree),
        FrozenNode.from_node(rhs_tree),
    )
    return dict(change_pairs)
def test_child_added(self):
    """Adding a child must be reflected in the parent's child_ops"""
    first_child = FrozenNode("Child1", label=['1111', 'a'])
    parent = FrozenNode("Root", label=['1111'], children=[first_child])
    second_child = FrozenNode("Child2", label=['1111', 'b'])
    grown = parent.clone(children=parent.children + (second_child,))

    changes = dict(difftree.changes_between(parent, grown))

    expected_child_ops = [
        ('equal', 0, 1),  # 1111-a
        ('insert', 1, ('1111-b',)),
    ]
    self.assertEqual(
        changes['1111'],
        {'op': 'modified', 'child_ops': expected_child_ops})
def test_child_order(self):
    """Reordering children must be reflected in the parent's child_ops"""
    child_a = FrozenNode("Child1", label=['1111', 'a'])
    child_b = FrozenNode("Child2", label=['1111', 'b'])
    parent = FrozenNode("Root", label=['1111'], children=[child_a, child_b])
    flipped = parent.clone(children=list(reversed(parent.children)))

    changes = dict(difftree.changes_between(parent, flipped))

    # Note that these ops could change in other versions of difflib.
    expected_child_ops = [
        ('insert', 0, ('1111-b',)),
        ('equal', 0, 1),  # 1111-a
        ('delete', 1, 2),
    ]
    self.assertEqual(
        changes['1111'],
        {'op': 'modified', 'child_ops': expected_child_ops})
def test_child_removed_with_edit(self):
    """Both the text ops and the child_ops are reported when a parent
    loses a child and has its own text edited at the same time"""
    child_a = FrozenNode("Child1", label=['1111', 'a'])
    child_b = FrozenNode("Child2", label=['1111', 'b'])
    parent = FrozenNode("Root", label=['1111'], children=[child_a, child_b])
    edited = parent.clone(children=parent.children[:1],
                          text="Root modified")

    changes = dict(difftree.changes_between(parent, edited))

    expected = {
        'op': 'modified',
        # the new words are appended right after the original "Root"
        'text': [('insert', len("Root"), " modified")],
        'child_ops': [
            ('equal', 0, 1),  # 1111-a
            ('delete', 1, 2),
        ],
    }
    self.assertEqual(changes['1111'], expected)
def test_subparts(self):
    """Create a tree with no subparts, then add subparts.

    Both trees are parsed from plain text for part 204. The diff must
    report the two new subparts as added and the old '204-Subpart' node as
    deleted, while leaving the sections themselves untouched.
    """
    title = u"Regulation Title"
    sect1_title = u"§ 204.1 First Section"
    sect1 = u"(a) I believe this is (b) the best section "
    sect2_title = u"§ 204.2 Second Section"
    sect2 = u"Some sections \ndon't have \ndepth at all."

    old_text = "\n".join([title, sect1_title, sect1, sect2_title, sect2])
    older = reg_text.build_reg_text_tree(old_text, 204)

    ntitle = u"Regulation Title"
    nsubpart_a = u"Subpart A—First subpart"
    nsect1_title = u"§ 204.1 First Section"
    nsect1 = u"(a) I believe this is (b) the best section "
    nsubpart_b = u"Subpart B—Second subpart"
    nsect2_title = u"§ 204.2 Second Section"
    nsect2 = u"Some sections \ndon't have \ndepth at all."

    new_text = "\n".join([
        ntitle, nsubpart_a, nsect1_title, nsect1, nsubpart_b, nsect2_title,
        nsect2
    ])
    newer = reg_text.build_reg_text_tree(new_text, 204)

    result = dict(
        difftree.changes_between(FrozenNode.from_node(older),
                                 FrozenNode.from_node(newer)))

    # assertEqual/assertIn/assertNotIn replace the deprecated assertEquals
    # alias and the less informative assertTrue/assertFalse membership
    # checks.
    self.assertEqual(
        result['204-Subpart-A'], {
            "node": {
                "text": u"",
                "node_type": u"subpart",
                "tagged_text": None,
                "label": ("204", "Subpart", "A"),
                "child_labels": ("204-1", ),
                "title": u"First subpart"
            },
            "op": "added"
        })
    self.assertIn('204-Subpart-B', result)
    self.assertEqual(result['204-Subpart'], {"op": "deleted"})
    # Sections shouldn't have changed, though
    self.assertNotIn('204-1', result)
    self.assertNotIn('204-2', result)
def test_child_added(self):
    """A newly appended child shows up as an insert in child_ops"""
    existing = FrozenNode("Child1", label=['1111', 'a'])
    root = FrozenNode("Root", label=['1111'], children=[existing])
    addition = FrozenNode("Child2", label=['1111', 'b'])
    root_with_addition = root.clone(children=root.children + (addition, ))

    root_changes = dict(
        difftree.changes_between(root, root_with_addition))['1111']

    self.assertEqual(
        root_changes,
        {
            'child_ops': [
                ('equal', 0, 1),  # 1111-a
                ('insert', 1, ('1111-b', )),
            ],
            'op': 'modified',
        })
def test_subparts(self):
    """Create a tree with no subparts, then add subparts.

    The old and new trees are built from regulation text for part 204 and
    frozen before diffing. Expected outcome: 'Subpart A' and 'Subpart B'
    appear in the diff, the old '204-Subpart' node is deleted, and neither
    section is reported as changed.
    """
    title = u"Regulation Title"
    sect1_title = u"§ 204.1 First Section"
    sect1 = u"(a) I believe this is (b) the best section "
    sect2_title = u"§ 204.2 Second Section"
    sect2 = u"Some sections \ndon't have \ndepth at all."

    old_text = "\n".join([title, sect1_title, sect1, sect2_title, sect2])
    older = reg_text.build_reg_text_tree(old_text, 204)

    ntitle = u"Regulation Title"
    nsubpart_a = u"Subpart A—First subpart"
    nsect1_title = u"§ 204.1 First Section"
    nsect1 = u"(a) I believe this is (b) the best section "
    nsubpart_b = u"Subpart B—Second subpart"
    nsect2_title = u"§ 204.2 Second Section"
    nsect2 = u"Some sections \ndon't have \ndepth at all."

    new_text = "\n".join([ntitle, nsubpart_a, nsect1_title, nsect1,
                          nsubpart_b, nsect2_title, nsect2])
    newer = reg_text.build_reg_text_tree(new_text, 204)

    result = dict(difftree.changes_between(FrozenNode.from_node(older),
                                           FrozenNode.from_node(newer)))

    # The deprecated assertEquals alias is replaced by assertEqual, and the
    # membership checks use assertIn/assertNotIn for clearer failures.
    self.assertEqual(
        result["204-Subpart-A"],
        {
            "node": {
                "text": u"",
                "node_type": u"subpart",
                "tagged_text": None,
                "label": ("204", "Subpart", "A"),
                "child_labels": ("204-1",),
                "title": u"First subpart",
            },
            "op": "added",
        },
    )
    self.assertIn("204-Subpart-B", result)
    self.assertEqual(result["204-Subpart"], {"op": "deleted"})
    # Sections shouldn't have changed, though
    self.assertNotIn("204-1", result)
    self.assertNotIn("204-2", result)
def test_subparts(self):
    """Create a tree with no subparts, then add subparts.

    The old tree keeps both sections under a single 'emptypart' node; the
    new tree splits them across 'Subpart A' and 'Subpart B'. The diff must
    add the two subparts, delete '204-Subpart', and leave the sections out
    of the result.
    """
    old_tree = FrozenNode(title="Regulation Title", label=['204'], children=[
        FrozenNode(node_type='emptypart', label=['204', 'Subpart'],
                   children=[
            FrozenNode(title=u"§ 204.1 First Section",
                       label=['204', '1'], children=[
                FrozenNode(text="(a) I believe this is the best section",
                           label=['204', '1', 'a'])]),
            FrozenNode(title=u"§ 204.2 Second Section",
                       label=['204', '2'],
                       text=u"Some sections \ndon't have \ndepth at all.")
        ])])
    new_tree = FrozenNode(title="Regulation Title", label=['204'], children=[
        FrozenNode(node_type='subpart', label=['204', 'Subpart', 'A'],
                   title=u"Subpart A—First subpart", children=[
            FrozenNode(title=u"§ 204.1 First Section",
                       label=['204', '1'], children=[
                FrozenNode(text="(a) I believe this is the best section",
                           label=['204', '1', 'a'])])]),
        FrozenNode(node_type='subpart', label=['204', 'Subpart', 'B'],
                   title=u"Subpart B—Second subpart", children=[
            FrozenNode(title=u"§ 204.2 Second Section",
                       label=['204', '2'],
                       text=u"Some sections \ndon't have \ndepth at all.")
        ])])

    result = dict(difftree.changes_between(old_tree, new_tree))

    # assertEqual replaces the deprecated assertEquals alias; assertIn and
    # assertNotIn replace assertTrue/assertFalse on membership expressions.
    self.assertEqual(
        result['204-Subpart-A'],
        {"node": {
            "text": u"", "node_type": u"subpart", "tagged_text": None,
            "label": ("204", "Subpart", "A"),
            "child_labels": ("204-1",),
            "title": u"Subpart A—First subpart"},
         "op": "added"})
    self.assertIn('204-Subpart-B', result)
    self.assertEqual(result['204-Subpart'], {"op": "deleted"})
    # Sections shouldn't have changed, though
    self.assertNotIn('204-1', result)
    self.assertNotIn('204-2', result)
def test_child_removed_with_edit(self):
    """child_ops and text ops must both be present when a child is
    removed and the parent's text is changed in the same revision"""
    kept = FrozenNode("Child1", label=['1111', 'a'])
    dropped = FrozenNode("Child2", label=['1111', 'b'])
    root = FrozenNode("Root", label=['1111'], children=[kept, dropped])
    trimmed = root.clone(children=root.children[:1], text="Root modified")

    root_changes = dict(difftree.changes_between(root, trimmed))['1111']

    text_ops = [('insert', len("Root"), " modified")]
    child_ops = [
        ('equal', 0, 1),  # 1111-a
        ('delete', 1, 2),
    ]
    self.assertEqual(
        root_changes,
        {'op': 'modified', 'text': text_ops, 'child_ops': child_ops})
def diffs(cfr_title, cfr_part):
    """Write a diff for every ordered pair of known trees that is stale.

    Each diff entry is registered as depending on both of its trees; only
    entries the dependency graph reports as stale are recomputed. Trees are
    read at most once and reused across pairs.
    """
    tree_dir = entry.FrozenTree(cfr_title, cfr_part)
    diff_dir = entry.Diff(cfr_title, cfr_part)

    # Every (lhs, rhs) combination, including a tree paired with itself
    pairs = []
    for lhs in tree_dir:
        for rhs in tree_dir:
            pairs.append((lhs, rhs))

    deps = dependency.Graph()
    for lhs_id, rhs_id in pairs:
        for tree_id in (lhs_id, rhs_id):
            deps.add(diff_dir / lhs_id / rhs_id, tree_dir / tree_id)

    loaded = {}
    for lhs_id, rhs_id in pairs:
        target = diff_dir / lhs_id / rhs_id
        deps.validate_for(target)
        if not deps.is_stale(target):
            continue
        # Read each side lazily, left before right, caching the result
        for tree_id in (lhs_id, rhs_id):
            if tree_id not in loaded:
                loaded[tree_id] = (tree_dir / tree_id).read()
        target.write(dict(changes_between(loaded[lhs_id], loaded[rhs_id])))
def test_child_order(self):
    """Swapping the order of two children produces child_ops"""
    first = FrozenNode("Child1", label=['1111', 'a'])
    second = FrozenNode("Child2", label=['1111', 'b'])
    root = FrozenNode("Root", label=['1111'], children=[first, second])
    reordered = root.clone(children=list(reversed(root.children)))

    root_changes = dict(difftree.changes_between(root, reordered))['1111']

    # Note that these ops could change in other versions of difflib.
    self.assertEqual(
        root_changes,
        {
            'child_ops': [
                ('insert', 0, ('1111-b', )),
                ('equal', 0, 1),  # 1111-a
                ('delete', 1, 2),
            ],
            'op': 'modified',
        })
def diffs(cfr_title, cfr_part):
    """Build (or refresh) the diffs between all known trees of a part.

    Pairs are formed from the last path component of every sub-entry of
    the frozen-tree directory, each paired with every other and with
    itself. A diff is rewritten only when the dependency graph marks it
    stale.
    """
    logger.info("Build diffs - %s Part %s", cfr_title, cfr_part)
    tree_dir = entry.FrozenTree(cfr_title, cfr_part)
    diff_dir = entry.Diff(cfr_title, cfr_part)

    pairs = []
    for left in tree_dir.sub_entries():
        for right in tree_dir.sub_entries():
            pairs.append((left.path[-1], right.path[-1]))

    # Each diff depends on the two trees it compares
    deps = dependency.Graph()
    for lhs_id, rhs_id in pairs:
        for tree_id in (lhs_id, rhs_id):
            deps.add(diff_dir / lhs_id / rhs_id, tree_dir / tree_id)

    cache = {}
    for lhs_id, rhs_id in pairs:
        diff_entry = diff_dir / lhs_id / rhs_id
        deps.validate_for(diff_entry)
        if not deps.is_stale(diff_entry):
            continue
        # Trees are read on first use only
        for tree_id in (lhs_id, rhs_id):
            if tree_id not in cache:
                cache[tree_id] = (tree_dir / tree_id).read()
        changes = dict(changes_between(cache[lhs_id], cache[rhs_id]))
        diff_entry.write(changes)
def diffs(cfr_title, cfr_part):
    """Construct diffs between known trees, skipping up-to-date ones.

    All ordered pairs of tree ids (self-pairs included) are considered.
    Every diff is declared dependent on its left and right tree, and is
    recomputed and written only if the dependency graph says it is stale.
    """
    logger.info("Build diffs - %s Part %s", cfr_title, cfr_part)
    tree_dir = entry.FrozenTree(cfr_title, cfr_part)
    diff_dir = entry.Diff(cfr_title, cfr_part)

    pairs = []
    for left_id in tree_dir:
        for right_id in tree_dir:
            pairs.append((left_id, right_id))

    deps = dependency.Graph()
    for left_id, right_id in pairs:
        deps.add(diff_dir / left_id / right_id, tree_dir / left_id)
        deps.add(diff_dir / left_id / right_id, tree_dir / right_id)

    trees_by_id = {}
    for left_id, right_id in pairs:
        destination = diff_dir / left_id / right_id
        deps.validate_for(destination)
        if not deps.is_stale(destination):
            continue
        # Load whichever side has not been read yet (left side first)
        for tree_id in (left_id, right_id):
            if tree_id not in trees_by_id:
                trees_by_id[tree_id] = (tree_dir / tree_id).read()
        left_tree = trees_by_id[left_id]
        right_tree = trees_by_id[right_id]
        destination.write(dict(changes_between(left_tree, right_tree)))
def test_subparts(self):
    """Create a tree with no subparts, then add subparts.

    Starts from a tree whose sections sit under one 'emptypart' node and
    compares it with a tree where the same sections are split between
    'Subpart A' and 'Subpart B'. The subparts must be added, the old
    '204-Subpart' deleted, and the sections must not appear in the diff.
    """
    old_tree = FrozenNode(
        title="Regulation Title",
        label=['204'],
        children=[
            FrozenNode(
                node_type='emptypart',
                label=['204', 'Subpart'],
                children=[
                    FrozenNode(
                        title=u"§ 204.1 First Section",
                        label=['204', '1'],
                        children=[
                            FrozenNode(
                                text=
                                "(a) I believe this is the best section",
                                label=['204', '1', 'a'])
                        ]),
                    FrozenNode(
                        title=u"§ 204.2 Second Section",
                        label=['204', '2'],
                        text=u"Some sections \ndon't have \ndepth at all.")
                ])
        ])
    new_tree = FrozenNode(
        title="Regulation Title",
        label=['204'],
        children=[
            FrozenNode(
                node_type='subpart',
                label=['204', 'Subpart', 'A'],
                title=u"Subpart A—First subpart",
                children=[
                    FrozenNode(
                        title=u"§ 204.1 First Section",
                        label=['204', '1'],
                        children=[
                            FrozenNode(
                                text=
                                "(a) I believe this is the best section",
                                label=['204', '1', 'a'])
                        ])
                ]),
            FrozenNode(
                node_type='subpart',
                label=['204', 'Subpart', 'B'],
                title=u"Subpart B—Second subpart",
                children=[
                    FrozenNode(
                        title=u"§ 204.2 Second Section",
                        label=['204', '2'],
                        text=u"Some sections \ndon't have \ndepth at all.")
                ])
        ])

    result = dict(difftree.changes_between(old_tree, new_tree))

    # Uses assertEqual instead of the deprecated assertEquals alias, and
    # assertIn/assertNotIn for the membership checks.
    self.assertEqual(
        result['204-Subpart-A'], {
            "node": {
                "text": u"",
                "node_type": u"subpart",
                "tagged_text": None,
                "label": ("204", "Subpart", "A"),
                "child_labels": ("204-1", ),
                "title": u"Subpart A—First subpart"
            },
            "op": "added"
        })
    self.assertIn('204-Subpart-B', result)
    self.assertEqual(result['204-Subpart'], {"op": "deleted"})
    # Sections shouldn't have changed, though
    self.assertNotIn('204-1', result)
    self.assertNotIn('204-2', result)
def test_whitespace_comparison(self):
    """Text differing only in its whitespace must not produce a diff"""
    original = FrozenNode(u"Some\t\nthing", label=['123'])
    respaced = original.clone(text=u"Some\u2009 thing")  # thin-space
    changes = difftree.changes_between(original, respaced)
    self.assertEqual(changes, [])
def test_title_disappears(self):
    """Losing the title yields a 'modified' op deleting the full title."""
    had_title = FrozenNode("Text", title="Some Title", label=["1111"])
    lost_title = FrozenNode("Text", title=None, label=["1111"])

    all_changes = dict(difftree.changes_between(had_title, lost_title))

    expected = {"op": "modified", "title": [("delete", 0, 10)]}
    self.assertEqual(all_changes["1111"], expected)
# this used to assume implicitly that if gen-diffs was not specified it was # True; changed it to explicit check if args.generate_diffs: all_versions = {doc_number: reg_tree} for last_notice, old, new_tree, notices in builder.revision_generator( reg_tree): version = last_notice['document_number'] logger.info("Version %s", version) all_versions[version] = new_tree builder.doc_number = version builder.write_regulation(new_tree) layer_cache.invalidate_by_notice(last_notice) builder.gen_and_write_layers(new_tree, act_title_and_section, layer_cache, notices) layer_cache.replace_using(new_tree) # convert to frozen trees for doc in all_versions: all_versions[doc] = FrozenNode.from_node(all_versions[doc]) # now build diffs - include "empty" diffs comparing a version to itself for lhs_version, lhs_tree in all_versions.iteritems(): for rhs_version, rhs_tree in all_versions.iteritems(): changes = checkpointer.checkpoint( "-".join(["diff", lhs_version, rhs_version]), lambda: dict(changes_between(lhs_tree, rhs_tree))) builder.writer.diff( reg_tree.label_id(), lhs_version, rhs_version ).write(changes)