def test_read_only_cache_ignores_modifications(self):
    """A LayeredCache created with read_only=True ignores add() and delete()."""
    key_1 = PudlResourceKey("a", "b", "r1")
    key_2 = PudlResourceKey("a", "b", "r2")

    # Populate the two layers directly, bypassing the layered cache.
    self.cache_1.add(key_1, b"xxx")
    self.cache_2.add(key_2, b"yyy")
    self.assertTrue(self.cache_1.contains(key_1))
    self.assertTrue(self.cache_2.contains(key_2))

    layered = resource_cache.LayeredCache(
        self.cache_1, self.cache_2, read_only=True)
    for key in (key_1, key_2):
        self.assertTrue(layered.contains(key))

    # Deleting through the read-only wrapper must leave both entries intact,
    # both as seen through the wrapper and in the layers themselves.
    for key in (key_1, key_2):
        layered.delete(key)
    for key in (key_1, key_2):
        self.assertTrue(layered.contains(key))
    self.assertTrue(self.cache_1.contains(key_1))
    self.assertTrue(self.cache_2.contains(key_2))

    # Adding through the wrapper must not store the entry anywhere.
    added_key = PudlResourceKey("a", "b", "new")
    layered.add(added_key, b"xyz")
    for cache in (layered, self.cache_1, self.cache_2):
        self.assertFalse(cache.contains(added_key))
def test_get_resources_filtering(self):
    """get_resources() returns all resources, or only those matching the filters."""
    # No filters: both resources come back, in this order.
    expected_all = [
        PudlResourceKey("epacems", "123", "first-red"),
        PudlResourceKey("epacems", "123", "second-blue"),
    ]
    found_all = list(self.descriptor.get_resources())
    self.assertEqual(expected_all, found_all)

    # A matching filter narrows the result to a single resource.
    expected_red = [PudlResourceKey("epacems", "123", "first-red")]
    found_red = list(self.descriptor.get_resources(color="red"))
    self.assertEqual(expected_red, found_red)

    # A filter that matches nothing yields an empty result.
    found_blueberry = list(self.descriptor.get_resources(flavor="blueberry"))
    self.assertEqual([], found_blueberry)
def test_add_single_resource(self):
    """After add(), contains() reports the resource and get() returns its content."""
    key = PudlResourceKey("ds", "doi", "file.txt")
    payload = b"blah"

    self.assertFalse(self.cache.contains(key))
    self.cache.add(key, payload)
    self.assertTrue(self.cache.contains(key))

    stored = self.cache.get(key)
    self.assertEqual(b"blah", stored)
def test_get_resource(self):
    """get_resource() returns the body served for the registered HTTP GET."""
    # Register the canned response for the URL the fetcher is expected to hit.
    responses.add(responses.GET, "http://localhost/first", body="blah")

    key = PudlResourceKey("epacems", self.PROD_EPACEMS_DOI, "first")
    content = self.fetcher.get_resource(key)

    self.assertEqual(b"blah", content)
def test_add_with_no_layers_does_nothing(self):
    """add() on a layered cache without any layers stores nothing."""
    key = PudlResourceKey("a", "b", "c")
    cache = self.layered_cache

    self.assertFalse(cache.contains(key))
    cache.add(key, b"sample")
    self.assertFalse(cache.contains(key))

    # delete() is exercised too; the test only requires that it does not raise.
    cache.delete(key)
def test_get_resource_with_invalid_checksum(self):
    """Fetching a resource whose content does not match the checksum raises ChecksumMismatch."""
    # Serve a body that differs from what the fetcher expects for "first".
    responses.add(
        responses.GET, "http://localhost/first", body="wrongContent")
    key = PudlResourceKey("epacems", self.PROD_EPACEMS_DOI, "first")

    with self.assertRaises(datastore.ChecksumMismatch):
        self.fetcher.get_resource(key)
def test_deletion(self):
    """delete() removes a previously added resource, as reported by contains()."""
    key = PudlResourceKey("a", "b", "c")
    cache = self.cache

    self.assertFalse(cache.contains(key))

    cache.add(key, b"sampleContents")
    self.assertTrue(cache.contains(key))

    cache.delete(key)
    self.assertFalse(cache.contains(key))
def test_that_two_cache_objects_share_storage(self):
    """A second LocalFileCache on the same path sees what the first one stores."""
    other_cache = resource_cache.LocalFileCache(Path(self.test_dir))
    key = PudlResourceKey("dataset", "doi", "file.txt")

    # Neither instance knows the resource yet.
    for cache in (self.cache, other_cache):
        self.assertFalse(cache.contains(key))

    # Write through one instance only ...
    self.cache.add(key, b"testContents")

    # ... and the entry is visible, and readable, through both.
    for cache in (self.cache, other_cache):
        self.assertTrue(cache.contains(key))
    self.assertEqual(b"testContents", other_cache.get(key))
def test_add_to_first_layer(self):
    """add() on a layered cache stores the entry in the first layer only."""
    layered = self.layered_cache
    for layer in (self.cache_1, self.cache_2):
        layered.add_cache_layer(layer)

    key = PudlResourceKey("a", "b", "x.txt")
    self.assertFalse(layered.contains(key))

    layered.add(key, b"sampleContent")

    self.assertTrue(layered.contains(key))
    # Only the layer that was registered first received the entry.
    self.assertTrue(self.cache_1.contains(key))
    self.assertFalse(self.cache_2.contains(key))
def get_datapackage_descriptor(self, dataset: str) -> DatapackageDescriptor:
    """Return the datapackage descriptor for ``dataset``.

    The DOI for the dataset is obtained from the zenodo fetcher. Descriptors
    are memoised per DOI in ``self._datapackage_descriptors``; on a miss the
    descriptor is loaded from the "datapackage.json" entry in ``self._cache``
    if present, otherwise it is fetched via the zenodo fetcher and its JSON
    is written back to ``self._cache``.

    Args:
        dataset: name of the dataset whose descriptor is requested.

    Returns:
        The DatapackageDescriptor stored for the dataset's DOI.
    """
    doi = self._zenodo_fetcher.get_doi(dataset)
    if doi in self._datapackage_descriptors:
        return self._datapackage_descriptors[doi]

    res = PudlResourceKey(dataset, doi, "datapackage.json")
    if self._cache.contains(res):
        # Rebuild the descriptor from the cached JSON document.
        raw_json = self._cache.get(res).decode('utf-8')
        descriptor = DatapackageDescriptor(
            json.loads(raw_json), dataset=dataset, doi=doi)
        self._datapackage_descriptors[doi] = descriptor
    else:
        descriptor = self._zenodo_fetcher.get_descriptor(dataset)
        # Memoise first, then persist the JSON for later lookups.
        self._datapackage_descriptors[doi] = descriptor
        self._cache.add(res, bytes(descriptor.get_json_string(), "utf-8"))
    return self._datapackage_descriptors[doi]
def get_resources(self, name: str = None, **filters: Any) -> Iterator[PudlResourceKey]:
    """Yield PudlResourceKey identifiers for the matching resources.

    Iterates over the "resources" entries of ``self.datapackage_json``.

    Args:
        name (str): if specified, only resource(s) with this name are
            considered.
        filters (dict): if specified, only resource(s) matching these
            key=value constraints are yielded. The constraints are matched
            against the 'parts' field of the resource entry in the
            datapackage.json.

    Yields:
        PudlResourceKey built from this descriptor's dataset and doi and the
        name of each matching resource.
    """
    for entry in self.datapackage_json["resources"]:
        # A falsy name means "no name constraint".
        wrong_name = bool(name) and entry["name"] != name
        if wrong_name or not self._matches(entry, **filters):
            continue
        yield PudlResourceKey(
            dataset=self.dataset,
            doi=self.doi,
            name=entry["name"])
def test_read_only_add_and_delete_do_nothing(self):
    """A LocalFileCache created with read_only=True ignores add() and delete()."""
    key = PudlResourceKey("a", "b", "c")
    read_only_cache = resource_cache.LocalFileCache(
        Path(self.test_dir), read_only=True)
    self.assertTrue(read_only_cache.is_read_only())

    # add() through the read-only instance must not store anything.
    read_only_cache.add(key, b"sample")
    self.assertFalse(read_only_cache.contains(key))

    # Insert via the read-write fixture cache instead; the read-only instance
    # is then expected to see the entry.
    # NOTE(review): presumably self.cache is rooted at self.test_dir as well
    # -- confirm in setUp.
    self.cache.add(key, b"sample")
    self.assertFalse(self.cache.is_read_only())
    self.assertTrue(read_only_cache.contains(key))

    # delete() through the read-only instance must leave the entry in place.
    read_only_cache.delete(key)
    self.assertTrue(read_only_cache.contains(key))
def test_read_only_layers_skipped_when_adding(self):
    """add() on a layered cache skips layers that are marked read_only."""
    read_only_layer = resource_cache.LocalFileCache(
        self.test_dir_1, read_only=True)
    writable_layer = resource_cache.LocalFileCache(self.test_dir_2)
    layered = resource_cache.LayeredCache(read_only_layer, writable_layer)
    all_caches = (layered, read_only_layer, writable_layer)

    key = PudlResourceKey("a", "b", "c")
    for cache in all_caches:
        self.assertFalse(cache.contains(key))

    # The entry lands in the writable layer, not in the read-only one.
    layered.add(key, b"test")
    self.assertTrue(layered.contains(key))
    self.assertFalse(read_only_layer.contains(key))
    self.assertTrue(writable_layer.contains(key))

    # After deleting through the layered cache the entry is gone everywhere.
    layered.delete(key)
    for cache in all_caches:
        self.assertFalse(cache.contains(key))
def test_get_uses_innermost_layer(self):
    """get() returns content from the earliest-added layer that holds the resource."""
    key = PudlResourceKey("a", "b", "x.txt")
    layered = self.layered_cache
    first, second = self.cache_1, self.cache_2
    layered.add_cache_layer(first)
    layered.add_cache_layer(second)

    # With the resource only in the second layer, that layer is used.
    second.add(key, b"secondLayer")
    self.assertEqual(b"secondLayer", layered.get(key))

    # Once the first layer has it as well, the first layer wins.
    first.add(key, b"firstLayer")
    self.assertEqual(b"firstLayer", layered.get(key))

    # add() on the layered cache overwrites the first layer only.
    layered.add(key, b"newContents")
    self.assertEqual(b"newContents", layered.get(key))
    self.assertEqual(b"newContents", first.get(key))
    self.assertEqual(b"secondLayer", second.get(key))

    # delete() likewise affects the first layer only, so the second layer's
    # copy becomes visible again through the layered cache.
    layered.delete(key)
    self.assertTrue(layered.contains(key))
    self.assertFalse(first.contains(key))
    self.assertTrue(second.contains(key))
    self.assertEqual(b"secondLayer", layered.get(key))
def test_get_resources_by_name(self):
    """get_resources(name=...) returns only the resource with that name."""
    expected = [PudlResourceKey("epacems", "123", "second-blue")]
    found = list(self.descriptor.get_resources(name="second-blue"))
    self.assertEqual(expected, found)
def get_resource_key(self, dataset: str, name: str) -> PudlResourceKey:
    """Build the PudlResourceKey for resource ``name`` of ``dataset``.

    The DOI is looked up in ``self._dataset_to_doi``; a failed lookup
    propagates whatever that mapping raises.
    """
    doi = self._dataset_to_doi[dataset]
    return PudlResourceKey(dataset, doi, name)
def test_get_resource_with_nonexistent_resource_fails(self):
    """get_resource() raises KeyError for a resource that does not exist."""
    missing = PudlResourceKey(
        "epacems", self.PROD_EPACEMS_DOI, "nonexistent")

    with self.assertRaises(KeyError):
        self.fetcher.get_resource(missing)
def test_get_resource_key(self):
    """get_resource_key() combines the dataset, its DOI and the resource name."""
    expected = PudlResourceKey(
        "epacems", self.PROD_EPACEMS_DOI, "blob.zip")
    actual = self.fetcher.get_resource_key("epacems", "blob.zip")
    self.assertEqual(expected, actual)