def test_deletes_dependencies(tmpdir_setup):
    """A dependency survives re-instantiating the Graph (it is serialized),
    but the `clear` command wipes it out."""
    dep_graph = dependency.Graph()
    dep_graph.add('a', 'b')
    assert len(dep_graph.dependencies('a')) == 1

    # A brand-new Graph instance still sees the serialized dependency
    dep_graph = dependency.Graph()
    assert len(dep_graph.dependencies('a')) == 1

    CliRunner().invoke(clear)

    # After `clear`, the dependency is gone
    dep_graph = dependency.Graph()
    assert len(dep_graph.dependencies('a')) == 0
def test_stale_layers(self):
    """We should have dependencies between all of the layers and their
    associated trees. We should also tie the meta layer to the version"""
    configured_layers = {'cfr': {'keyterms': None, 'other': None}}
    with self.cli.isolated_filesystem(), patch.dict(
            layers.LAYER_CLASSES, configured_layers):
        version_entry = entry.Version(111, 22, 'aaa')
        version_entry.write(Version('aaa', date.today(), date.today()))
        tree_entry = entry.Tree(111, 22, 'aaa')

        # No tree written yet -> Missing. Use list() to instantiate
        self.assertRaises(dependency.Missing, list,
                          layers.stale_layers(tree_entry, 'cfr'))

        entry.Entry('tree', 111, 22, 'bbb').write(b'')  # wrong version
        self.assertRaises(dependency.Missing, list,
                          layers.stale_layers(tree_entry, 'cfr'))

        # Correct tree present -> every configured layer is stale
        entry.Entry('tree', 111, 22, 'aaa').write(b'')
        six.assertCountEqual(self, layers.stale_layers(tree_entry, 'cfr'),
                             ['keyterms', 'other'])
        self.assertIn(
            str(version_entry),
            dependency.Graph().dependencies(
                str(entry.Layer.cfr(111, 22, 'aaa', 'meta'))))
def test_dependencies(self, notice_xmls_for_url):
    """If the xml comes from a local source, we should expect a dependency
    be present. Otherwise, we should expect no dependency"""
    cli = CliRunner()
    self.expect_common_json()
    notice_xmls_for_url.return_value = [self.example_xml(source='./here')]

    # Local source -> the notice entry appears in the dependency graph
    with cli.isolated_filesystem():
        cli.invoke(preprocess_notice, ['1234-5678'])
        entry_str = str(entry.Notice() / '1234-5678')
        self.assertIn(entry_str, dependency.Graph())

    # Remote source -> no dependency recorded
    notice_xmls_for_url.return_value[0].source = 'http://example.com'
    with cli.isolated_filesystem():
        cli.invoke(preprocess_notice, ['1234-5678'])
        entry_str = str(entry.Notice() / '1234-5678')
        self.assertNotIn(entry_str, dependency.Graph())
def fetch_annual_edition(cfr_title, cfr_part, year):
    """Download an annual edition of a regulation"""
    volume = annual.find_volume(year, cfr_title, cfr_part)
    xml = volume.find_part_xml(cfr_part).preprocess()

    annual_entry = entry.Annual(cfr_title, cfr_part, year)
    annual_entry.write(xml)

    # Only a locally-sourced XML file creates a dependency edge
    if xml.source_is_local:
        dependency.Graph().add(str(annual_entry), xml.source)
def generate_dependencies(version_dir, version_ids, delays_by_version):
    """Creates a dependency graph and adds all dependencies for input xml and
    delays between notices"""
    notice_dir = entry.Notice()
    graph = dependency.Graph()

    # Each version depends on its own notice...
    for version_id in version_ids:
        graph.add(version_dir / version_id, notice_dir / version_id)

    # ...and a delayed version additionally depends on the delaying notice
    for delayed_id, delay in delays_by_version.items():
        graph.add(version_dir / delayed_id, notice_dir / delay.by)

    return graph
def preprocess_notice(document_number):
    """Preprocess notice XML. Either fetch from the Federal Register or read
    a notice from disk. Apply some common transformations to it and output
    the resulting file(s). There may be more than one as documents might be
    split if they have multiple effective dates."""
    meta = federalregister.meta_data(
        document_number,
        ["agencies", "docket_ids", "effective_on", "cfr_references",
         "comments_close_on", "full_text_xml_url", "html_url",
         "publication_date", "regulation_id_numbers", "volume"])
    notice_xmls = list(notice_xmls_for_url(document_number,
                                           meta['full_text_xml_url']))
    deps = dependency.Graph()

    for notice_xml in notice_xmls:
        # Copy metadata from the FR API onto the XML wrapper
        notice_xml.published = meta['publication_date']
        notice_xml.fr_volume = meta['volume']
        if meta.get('html_url'):
            notice_xml.fr_html_url = meta['html_url']
        if meta.get("comments_close_on"):
            notice_xml.comments_close_on = meta["comments_close_on"]
        if meta.get('regulation_id_numbers'):
            notice_xml.rins = meta['regulation_id_numbers']
        if meta.get('docket_ids'):
            notice_xml.docket_ids = meta['docket_ids']
        notice_xml.set_agencies(meta.get('agencies', []))

        cfr_refs = convert_cfr_refs(meta.get('cfr_references', []))
        if cfr_refs:
            notice_xml.cfr_refs = cfr_refs

        file_name = document_number
        if len(notice_xmls) > 1:
            # Split documents get a derived-date suffix in their id
            effective_date = notice_xml.derive_effective_date()
            file_name = split_doc_num(document_number,
                                      effective_date.isoformat())
        elif meta.get('effective_on'):
            notice_xml.effective = meta['effective_on']

        notice_xml.version_id = file_name
        notice_xml.derive_where_needed()

        notice_entry = entry.Notice(file_name)
        notice_entry.write(notice_xml)
        # Only locally-sourced XML creates a dependency edge
        if notice_xml.source_is_local:
            deps.add(str(notice_entry), notice_xml.source)
def test_dependencies_remote(self, notice_xmls_for_url):
    """If the xml comes from a remote source, we should not see a
    dependency"""
    cli = CliRunner()
    self.expect_common_json()
    notice_xmls_for_url.return_value = [self.example_xml(source='./here')]
    # Override the source so it looks remote
    notice_xmls_for_url.return_value[0].source = 'http://example.com'

    with cli.isolated_filesystem():
        cli.invoke(preprocess_notice, ['1234-5678'])
        entry_str = str(entry.Notice() / '1234-5678')
        assert len(dependency.Graph().dependencies(entry_str)) == 0
def is_stale(cfr_title, cfr_part, version_id):
    """Build the dependency graph for a single SxS layer, validate it, and
    report whether that layer needs to be regenerated."""
    graph = dependency.Graph()
    layer_entry = entry.Layer(cfr_title, cfr_part, version_id, 'analyses')

    # Layers depend on their associated tree
    graph.add(layer_entry, entry.Tree(cfr_title, cfr_part, version_id))
    # And on all notices which came before
    for sxs_entry in previous_sxs(cfr_title, cfr_part, version_id):
        graph.add(layer_entry, sxs_entry)

    graph.validate_for(layer_entry)
    return graph.is_stale(layer_entry)
def notice_preamble(doc_number):
    """Pull down and parse the preamble from this notice."""
    logger.info("Parsing Preamble for %s", doc_number)
    preamble_path = entry.Preamble(convert_id(doc_number))
    notice_path = entry.Notice(doc_number)

    graph = dependency.Graph()
    graph.add(preamble_path, notice_path)
    graph.validate_for(preamble_path)

    # Only re-parse when the notice is newer than the stored preamble
    if graph.is_stale(preamble_path):
        preamble_path.write(parse_preamble(notice_path.read()))
def test_derived_from_rules(self):
    """Should filter a set of version ids to only those with a dependency on
    changes derived from a rule"""
    with self.cli.isolated_filesystem():
        tree_dir = entry.Tree('12', '1000')
        graph = dependency.Graph()
        # 111 comes from an annual edition; 222 & 333 from rules
        graph.add(tree_dir / 111, entry.Annual(12, 1000, 2001))
        graph.add(tree_dir / 222, entry.RuleChanges(222))
        graph.add(tree_dir / 333, entry.RuleChanges(333))
        graph.add(tree_dir / 333, entry.Version(333))

        derived = fill_with_rules.derived_from_rules(
            ['111', '222', '333', '444'], graph, tree_dir)
        self.assertEqual(derived, ['222', '333'])
def test_is_derived():
    """Should filter version ids to only those with a dependency on changes
    derived from a rule"""
    tree_dir = entry.Tree('12', '1000')
    graph = dependency.Graph()
    # 111 comes from an annual edition; 222 & 333 come from notices
    graph.add(tree_dir / 111, entry.Annual(12, 1000, 2001))
    graph.add(tree_dir / 222, entry.Notice(222))
    graph.add(tree_dir / 333, entry.Notice(333))
    graph.add(tree_dir / 333, entry.Version(333))

    assert not fill_with_rules.is_derived('111', graph, tree_dir)
    assert fill_with_rules.is_derived('222', graph, tree_dir)
    assert fill_with_rules.is_derived('333', graph, tree_dir)
    # 444 has no dependencies at all
    assert not fill_with_rules.is_derived('444', graph, tree_dir)
def write_if_stale(notice_xml):
    """We only want to write out the processed xml if it is "stale", i.e. if
    its source has changed.

    Writes (and re-records the dependency) when any of these hold:
    * the notice has never been written,
    * its recorded source differs from ``notice_xml.source``, or
    * the dependency graph says the entry is out of date.
    """
    deps = dependency.Graph()
    notice_entry = entry.Notice(notice_xml.version_id)
    new_notice = notice_entry not in deps
    # BUG FIX: dependencies() is keyed by the entry's string path (see the
    # other call sites, e.g. stale_layers tests) -- passing the XML object
    # never matched, so the notice was rewritten unconditionally.
    diff_source = notice_xml.source not in deps.dependencies(str(notice_entry))
    source_changed = deps.is_stale(notice_entry)
    if new_notice or diff_source or source_changed:
        # Reset the dependency edge so it reflects the current source
        deps.clear_for(notice_entry)
        deps.add(notice_entry, notice_xml.source)
        notice_entry.write(notice_xml)
def process_tree_if_needed(cfr_title, cfr_part, version_id):
    """Creates and writes a regulation tree if the appropriate notice
    exists"""
    notice_entry = entry.Notice(version_id)
    tree_entry = entry.Tree(cfr_title, cfr_part, version_id)

    graph = dependency.Graph()
    graph.add(tree_entry, notice_entry)
    graph.validate_for(tree_entry)

    # Only rebuild the tree when the notice is newer than the stored tree
    if graph.is_stale(tree_entry):
        notice_xml = notice_entry.read()
        regtext = regtext_for_part(notice_xml, cfr_title, cfr_part)
        tree_entry.write(build_tree(regtext))
def process_version_if_needed(cfr_title, cfr_part, version_id):
    """Creates and writes a version struct after validating the Notice has
    been created"""
    notice_entry = entry.Notice(version_id)
    version_entry = entry.Version(cfr_title, cfr_part, version_id)

    graph = dependency.Graph()
    graph.add(version_entry, notice_entry)
    graph.validate_for(version_entry)

    # Only rewrite when the notice is newer than the stored version
    if graph.is_stale(version_entry):
        notice_xml = notice_entry.read()
        version_entry.write(Version(version_id, notice_xml.effective,
                                    notice_xml.fr_citation))
def test_is_derived(self):
    """Should filter version ids to only those with a dependency on changes
    derived from a rule"""
    with self.cli.isolated_filesystem():
        tree_dir = entry.Tree('12', '1000')
        graph = dependency.Graph()
        # 111 comes from an annual edition; 222 & 333 come from notices
        graph.add(tree_dir / 111, entry.Annual(12, 1000, 2001))
        graph.add(tree_dir / 222, entry.Notice(222))
        graph.add(tree_dir / 333, entry.Notice(333))
        graph.add(tree_dir / 333, entry.Version(333))

        self.assertFalse(fill_with_rules.is_derived('111', graph, tree_dir))
        self.assertTrue(fill_with_rules.is_derived('222', graph, tree_dir))
        self.assertTrue(fill_with_rules.is_derived('333', graph, tree_dir))
        # 444 has no dependencies at all
        self.assertFalse(fill_with_rules.is_derived('444', graph, tree_dir))
def test_dependencies_serialized(self):
    """Every instance of dependency.Graph shares a serialized copy of the
    dependencies"""
    with self.dependency_graph() as dgraph:
        dgraph.add(self.depender, self.dependency / '1')
        dgraph.add(self.depender, self.dependency / '2')

        expected = [str(self.dependency / 1), str(self.dependency / 2)]
        # The instance that recorded the edges sees them...
        six.assertCountEqual(
            self, dgraph.dependencies(str(self.depender)), expected)
        # ...and so does a completely fresh instance
        six.assertCountEqual(
            self, dependency.Graph().dependencies(str(self.depender)),
            expected)
def dependencies(tree_path, version_ids, cfr_title, cfr_part):
    """Set up the dependency graph for this regulation. First calculates
    "gaps" -- versions for which there is no existing tree. In this
    calculation, we ignore the first version, as we won't be able to build
    anything for it. Add dependencies for any gaps, tying the output tree to
    the preceding tree, the version info and the parsed rule"""
    existing_ids = set(tree_path)
    # Pair each version with its predecessor; keep only missing trees
    gaps = [(prev, curr)
            for prev, curr in zip(version_ids, version_ids[1:])
            if curr not in existing_ids]

    graph = dependency.Graph()
    for prev, curr in gaps:
        graph.add(tree_path / curr, tree_path / prev)
        graph.add(tree_path / curr, entry.RuleChanges(curr))
        graph.add(tree_path / curr, entry.Version(cfr_title, cfr_part, curr))
    return graph
def process_if_needed(volume, cfr_part):
    """Review dependencies; if they're out of date, parse the annual edition
    into a tree and store that"""
    version_id = _version_id(volume.year, cfr_part)
    annual_entry = entry.Annual(volume.title, cfr_part, volume.year)
    tree_entry = entry.Tree(volume.title, cfr_part, version_id)
    notice_entry = entry.Notice(version_id)

    graph = dependency.Graph()
    graph.add(tree_entry, annual_entry)
    graph.validate_for(tree_entry)

    if graph.is_stale(tree_entry):
        tree = xml_parser.reg_text.build_tree(annual_entry.read().xml)
        tree_entry.write(tree)
        # Annual editions have no real notice; synthesize one
        notice_entry.write(
            build_fake_notice(version_id, volume.publication_date,
                              volume.title, cfr_part))
def stale_layers(doc_entry, doc_type):
    """Return the name of layer dependencies which are now stale. Limit to a
    particular doc_type"""
    graph = dependency.Graph()
    layer_dir = entry.Layer(doc_type, *doc_entry.path)

    for layer_name in LAYER_CLASSES[doc_type]:
        # Layers depend on their associated tree
        graph.add(layer_dir / layer_name, doc_entry)
    if doc_type == 'cfr':
        # Meta layer also depends on the version info
        graph.add(layer_dir / 'meta', entry.Version(*doc_entry.path))

    for layer_name in LAYER_CLASSES[doc_type]:
        layer_entry = layer_dir / layer_name
        graph.validate_for(layer_entry)
        if graph.is_stale(layer_entry):
            yield layer_name
def test_rebuild(self):
    """Validate that the `rebuild()` method calculates the correct "stale"
    references"""
    with CliRunner().isolated_filesystem():
        dep_graph = dependency.Graph()
        root = entry.Entry('path')
        a, b, c, d = [root / char for char in 'abcd']

        # (A, B) -> C -> D
        dep_graph.add(c, a)
        dep_graph.add(c, b)
        dep_graph.add(d, c)

        # None of the files exist yet; A & B have no dependencies, so they
        # are stale due to themselves. C & D are stale due either A or B
        self.assert_rebuilt_state(dep_graph, root,
                                  a='a', b='b', c='ab', d='ab')

        b.write(b'bbb')
        # B exists now, so dependency errors are only due to A now
        self.assert_rebuilt_state(dep_graph, root, a='a', b='', c='a', d='a')

        a.write(b'aaa')
        # A exists now, too, so C is the bottleneck
        self.assert_rebuilt_state(dep_graph, root, a='', b='', c='c', d='c')

        c.write(b'ccc')
        # Now there's only the final, self-reference
        self.assert_rebuilt_state(dep_graph, root, a='', b='', c='', d='d')

        d.write(b'ddd')
        # Now no one is stale
        self.assert_rebuilt_state(dep_graph, root, a='', b='', c='', d='')

        self._touch(a, 1000)
        # A's been updated. Need to run everything after it
        self.assert_rebuilt_state(dep_graph, root, a='', b='', c='a', d='a')

        self._touch(d, 2000)
        self._touch(c, 3000)
        # C and D have been updated, but C's been updated after D
        self.assert_rebuilt_state(dep_graph, root, a='', b='', c='', d='c')
def dependencies(tree_dir, version_dir, versions_with_parents):
    """Set up the dependency graph for this regulation. First calculates
    "gaps" -- versions for which there is no existing tree. In this
    calculation, we ignore the first version, as we won't be able to build
    anything for it. Add dependencies for any gaps, tying the output tree to
    the preceding tree, the version info and the parsed rule"""
    existing_tree_ids = set(tree_dir)
    # Skip the first version; it has no parent to build from
    versions_with_parents = versions_with_parents[1:]
    gaps = [(version, parent)
            for (version, parent) in versions_with_parents
            if version.identifier not in existing_tree_ids]

    graph = dependency.Graph()
    for version, parent in gaps:
        doc_number = version.identifier
        graph.add(tree_dir / doc_number, tree_dir / parent.identifier)
        graph.add(tree_dir / doc_number, entry.Notice(doc_number))
        graph.add(tree_dir / doc_number, version_dir / doc_number)
    return graph
def parse_rule_changes(document_number):
    """Parse changes present in a single rule.

    DOCUMENT_NUMBER is the identifier associated with a final rule. If a rule
    has been split, use the split identifiers, a.k.a. version ids."""
    rule_entry = entry.RuleChanges(document_number)
    notice_entry = entry.Notice(document_number)

    graph = dependency.Graph()
    graph.add(rule_entry, notice_entry)
    graph.validate_for(rule_entry)

    # We don't check for staleness as we want to always execute when given a
    # specific file to process
    notice_xml = notice_entry.read()
    notice = process_amendments({'cfr_parts': notice_xml.cfr_parts},
                                notice_xml.xml)
    rule_entry.write(notice)
def process_if_needed(cfr_title, cfr_part, last_version_list):
    """Calculate dependencies between input and output files for these
    annual editions. If an output is missing or out of date, process it"""
    annual_path = entry.Annual(cfr_title, cfr_part)
    tree_path = entry.Tree(cfr_title, cfr_part)
    version_path = entry.Version(cfr_title, cfr_part)

    # Each output tree depends on its version info and its annual edition
    graph = dependency.Graph()
    for last_version in last_version_list:
        out_tree = tree_path / last_version.version_id
        graph.add(out_tree, version_path / last_version.version_id)
        graph.add(out_tree, annual_path / last_version.year)

    for last_version in last_version_list:
        tree_entry = tree_path / last_version.version_id
        graph.validate_for(tree_entry)
        if graph.is_stale(tree_entry):
            input_entry = annual_path / last_version.year
            tree_entry.write(
                gpo_cfr.builder.build_tree(input_entry.read().xml))
def fetch_sxs(document_number):
    """Fetch and parse Section-by-Section analyses.

    DOCUMENT_NUMBER is the identifier associated with a final rule. If a rule
    has been split, use the split identifiers, a.k.a. version ids"""
    sxs_entry = entry.SxS(document_number)
    notice_entry = entry.Notice(document_number)

    graph = dependency.Graph()
    graph.add(sxs_entry, notice_entry)
    graph.validate_for(sxs_entry)

    # We don't check for staleness as we want to always execute when given a
    # specific file to process
    # @todo - break apart processing of SxS. We don't need all of the other
    # fields
    notice_xml = notice_entry.read()
    notice_meta = meta_data(document_number, FULL_NOTICE_FIELDS)
    notice = build_notice(notice_xml.cfr_titles[0], None, notice_meta,
                          xml_to_process=notice_xml.xml)[0]
    sxs_entry.write(notice)
def diffs(cfr_title, cfr_part):
    """Construct diffs between known trees."""
    logger.info("Build diffs - %s Part %s", cfr_title, cfr_part)
    tree_dir = entry.FrozenTree(cfr_title, cfr_part)
    diff_dir = entry.Diff(cfr_title, cfr_part)

    # Cartesian product: every ordered pair of trees (including self-pairs)
    pairs = [(lhs, rhs) for lhs in tree_dir for rhs in tree_dir]
    graph = dependency.Graph()
    for lhs_id, rhs_id in pairs:
        graph.add(diff_dir / lhs_id / rhs_id, tree_dir / lhs_id)
        graph.add(diff_dir / lhs_id / rhs_id, tree_dir / rhs_id)

    trees = {}  # cache of parsed trees, loaded lazily
    for lhs_id, rhs_id in pairs:
        path = diff_dir / lhs_id / rhs_id
        graph.validate_for(path)
        if graph.is_stale(path):
            if lhs_id not in trees:
                trees[lhs_id] = (tree_dir / lhs_id).read()
            if rhs_id not in trees:
                trees[rhs_id] = (tree_dir / rhs_id).read()
            path.write(dict(changes_between(trees[lhs_id], trees[rhs_id])))
def test_stale_layers(monkeypatch):
    """We should have dependencies between all of the layers and their
    associated trees. We should also tie the meta layer to the version"""
    monkeypatch.setattr(layers, 'LAYER_CLASSES',
                        {'cfr': {'keyterms': None, 'other': None}})
    version_entry = entry.Version(111, 22, 'aaa')
    version_entry.write(Version('aaa', date.today(), Citation(1, 1)))
    tree_entry = entry.Tree(111, 22, 'aaa')

    # No tree written yet -> Missing
    with pytest.raises(dependency.Missing):
        layers.stale_layers(tree_entry, 'cfr')

    entry.Entry('tree', 111, 22, 'bbb').write(b'')  # wrong version
    with pytest.raises(dependency.Missing):
        layers.stale_layers(tree_entry, 'cfr')

    # Correct tree present -> every configured layer is stale
    entry.Entry('tree', 111, 22, 'aaa').write(b'')
    assert set(layers.stale_layers(tree_entry, 'cfr')) == {'keyterms',
                                                           'other'}
    assert str(version_entry) in dependency.Graph().dependencies(
        str(entry.Layer.cfr(111, 22, 'aaa', 'meta')))
def dependency_graph(self):
    """Yield a fresh dependency.Graph inside an isolated filesystem, setting
    up depender/dependency entry paths on the test instance."""
    with CliRunner().isolated_filesystem():
        root = entry.Entry('path')
        self.depender = root / 'depender'
        self.dependency = root / 'dependency'
        yield dependency.Graph()