def test_metadata_file_attributes(manifest_for_metadata):
    """
    Check that Manifest.metadata_file_attributes hands back an object
    carrying the expected url, version_id, file_hash and local_path for
    every metadata file in the fixture, and that requesting a metadata
    file that is not listed raises a ValueError with the expected message
    """
    manifest = Manifest('/my/cache/dir/', manifest_for_metadata)

    # (file name, url, version_id, file_hash, un-resolved local path)
    known_files = [
        ('a.txt',
         'http://my.url.com/path/to/a.txt',
         '12345',
         'abcde',
         '/my/cache/dir/some-project-000/to/a.txt'),
        ('b.txt',
         'http://my.other.url.com/different/path/to/b.txt',
         '67890',
         'fghijk',
         '/my/cache/dir/some-project-000/path/to/b.txt'),
    ]

    for fname, url, version_id, file_hash, raw_path in known_files:
        attrs = manifest.metadata_file_attributes(fname)
        assert attrs.url == url
        assert attrs.version_id == version_id
        assert attrs.file_hash == file_hash
        resolved = pathlib.Path(safe_system_path(raw_path)).resolve()
        assert attrs.local_path == resolved

    # a file name that is absent from the manifest must be rejected
    with pytest.raises(ValueError) as err_info:
        manifest.metadata_file_attributes('c.txt')
    expected_fragment = "c.txt\nis not in self.metadata_file_names"
    assert expected_fragment in err_info.value.args[0]
def test_data_file_attributes(manifest_with_data):
    """
    Check that Manifest.data_file_attributes hands back an object
    carrying the expected url, version_id, file_hash and local_path for
    every data file in the fixture, and that requesting a file_id that
    is not listed raises a ValueError with the expected message
    """
    manifest = Manifest('/my/cache/dir', manifest_with_data)

    # (file_id, url, version_id, file_hash, un-resolved local path)
    known_files = [
        ('a',
         'http://my.url.com/myproject/path/to/a.nwb',
         '12345',
         'abcde',
         '/my/cache/dir/myproject-0/path/to/a.nwb'),
        ('b',
         'http://my.other.url.com/different/path/b.nwb',
         '67890',
         'fghijk',
         '/my/cache/dir/myproject-0/path/b.nwb'),
    ]

    for file_id, url, version_id, file_hash, raw_path in known_files:
        attrs = manifest.data_file_attributes(file_id)
        assert attrs.url == url
        assert attrs.version_id == version_id
        assert attrs.file_hash == file_hash
        resolved = pathlib.Path(safe_system_path(raw_path)).resolve()
        assert attrs.local_path == resolved

    # a file_id that is absent from the manifest must be rejected
    with pytest.raises(ValueError) as err_info:
        manifest.data_file_attributes('c')
    expected_fragment = "file_id: c\nIs not a data file listed in manifest:"
    assert expected_fragment in err_info.value.args[0]
def test_constructor(meta_json_path):
    """
    Smoke test of Manifest.__init__: a str cache_dir and a pathlib.Path
    cache_dir must both be accepted, while a cache_dir of another type
    (here a float) must raise a ValueError
    """
    accepted_cache_dirs = (
        'my/cache/dir',
        pathlib.Path('my/other/cache/dir'),
    )
    for cache_dir in accepted_cache_dirs:
        Manifest(cache_dir, meta_json_path)

    expected_message = r"cache_dir must be either a str.*"
    with pytest.raises(ValueError, match=expected_message):
        Manifest(1234.2, meta_json_path)
def _load_manifest(self, manifest_name: str) -> Manifest:
    """
    Read one of this dataset's manifests into a Manifest object,
    downloading it first if it is not yet in the cache directory,
    and record its name as the manifest most recently used.

    Parameters
    ----------
    manifest_name: str
        The name of the manifest to load. Must be an element in
        self.manifest_file_names

    Returns
    -------
    Manifest

    Raises
    ------
    ValueError
        If manifest_name is not in self.manifest_file_names
    """
    valid_names = self.manifest_file_names
    if manifest_name not in valid_names:
        message = (f"manifest: {manifest_name}\n"
                   "is not one of the valid manifest names "
                   "for this dataset:\n"
                   f"{valid_names}")
        raise ValueError(message)

    manifest_path = os.path.join(self._cache_dir, manifest_name)
    already_cached = os.path.exists(manifest_path)
    if not already_cached:
        self._download_manifest(manifest_name)

    with open(manifest_path) as manifest_handle:
        loaded = Manifest(cache_dir=self._cache_dir,
                          json_input=manifest_handle)

    # remember which manifest was loaded last
    with open(self._manifest_last_used, 'w') as marker:
        marker.write(manifest_name)

    return loaded
def test_create_file_attributes(meta_json_path):
    """
    Check that Manifest._create_file_attributes passes its url,
    version_id and file_hash arguments through unchanged and derives
    the expected local_path from them (the local_path is the part of
    the result that is actually computed)
    """
    url = 'http://my.url.com/path/to/file.txt'
    version_id = '12345'
    file_hash = 'aaabbbcccddd'

    manifest = Manifest('/my/cache/dir', meta_json_path)
    result = manifest._create_file_attributes(url, version_id, file_hash)

    assert isinstance(result, CacheFileAttributes)
    assert result.url == url
    assert result.version_id == version_id
    assert result.file_hash == file_hash

    resolved = pathlib.Path('/my/cache/dir/X-Y/to/file.txt').resolve()
    assert result.local_path == resolved
def load_manifest(self, manifest_name: str):
    """
    Make one of this dataset's manifests the active one, downloading
    it first if it is not yet in the cache directory. The resulting
    Manifest is stored on self._manifest.

    Parameters
    ----------
    manifest_name: str
        The name of the manifest to load. Must be an element in
        self.manifest_file_names

    Raises
    ------
    ValueError
        If manifest_name is not in self.manifest_file_names
    """
    valid_names = self.manifest_file_names
    if manifest_name not in valid_names:
        message = (f"manifest: {manifest_name}\n"
                   "is not one of the valid manifest names "
                   "for this dataset:\n"
                   f"{valid_names}")
        raise ValueError(message)

    manifest_path = os.path.join(self._cache_dir, manifest_name)
    already_cached = os.path.exists(manifest_path)
    if not already_cached:
        self._download_manifest(manifest_name)

    with open(manifest_path) as manifest_handle:
        self._manifest = Manifest(cache_dir=self._cache_dir,
                                  json_input=manifest_handle)
def _load_manifest(self,
                   manifest_name: str,
                   use_static_project_dir: bool = False) -> Manifest:
    """
    Read one of this dataset's manifests from the cache directory
    and return it as a Manifest object.

    Parameters
    ----------
    manifest_name: str
        The name of the manifest to load. Must be an element in
        self.manifest_file_names
    use_static_project_dir: bool
        Normally the Manifest class places local copies of remote
        resources (data or metadata files) under a versioned project
        subdirectory of the user provided `cache_dir`
        (e.g. f"{cache_dir}/{project_name}-{manifest_version}") so that
        several manifest (and data) versions can coexist.

        In some situations, such as using a project's s3 bucket directly
        as the cache_dir, the project directory name has to be static
        (e.g. f"{cache_dir}/{project_name}") instead.

        When True, the manifest is read from
        f"{cache_dir}/{project_name}/manifests/{manifest_name}" and the
        flag is forwarded to the Manifest class; when False, the
        manifest is read from f"{cache_dir}/{manifest_name}".
        Defaults to False.

    Returns
    -------
    Manifest

    Raises
    ------
    ValueError
        If manifest_name is not in self.manifest_file_names
    """
    valid_names = self.manifest_file_names
    if manifest_name not in valid_names:
        message = (
            f"Manifest to load ({manifest_name}) is not one of the "
            "valid manifest names for this dataset. Valid names include:\n"
            f"{valid_names}")
        raise ValueError(message)

    # build the location of the manifest piece by piece; the static
    # layout inserts "<project_name>/manifests" below the cache dir
    path_parts = [self._cache_dir]
    if use_static_project_dir:
        path_parts.extend([self.project_name, "manifests"])
    path_parts.append(manifest_name)
    manifest_path = os.path.join(*path_parts)

    with open(manifest_path, "r") as manifest_handle:
        loaded = Manifest(
            cache_dir=self._cache_dir,
            json_input=manifest_handle,
            use_static_project_dir=use_static_project_dir)

    return loaded
def test_file_attribute_errors(meta_json_path):
    """
    Check that Manifest raises a ValueError with the expected message
    when asked for the attributes of a metadata file or a data file
    that it does not know about
    """
    manifest = Manifest("/my/cache/dir", meta_json_path)

    metadata_error = r".* not in self.metadata_file_names"
    with pytest.raises(ValueError, match=metadata_error):
        manifest.metadata_file_attributes('some_file.txt')

    data_error = r".* not a data file listed in manifest"
    with pytest.raises(ValueError, match=data_error):
        manifest.data_file_attributes('other_file.txt')
def test_windows_path_to_isilon(monkeypatch, tmpdir):
    """
    This test is just meant to verify on Windows CI instances
    that, if a path to the `/allen/` shared file store is used as
    cache_dir, the path to files will come out useful (i.e. without any
    spurious C:/ prepended as in AllenSDK issue #1964)
    """

    cache_dir = Path(tmpdir)

    manifest_1 = {
        'manifest_version': '1',
        'metadata_file_id_column_name': 'file_id',
        'data_pipeline': 'placeholder',
        'project_name': 'my-project',
        'metadata_files': {
            'a.csv': {
                'url': 'http://www.junk.com/path/to/a.csv',  # noqa: E501
                'version_id': '1111',
                'file_hash': 'abcde'
            },
            'b.csv': {
                'url': 'http://silly.com/path/to/b.csv',  # noqa: E501
                'version_id': '2222',
                'file_hash': 'fghijk'
            }
        },
        'data_files': {
            'data_1': {
                'url': 'http://www.junk.com/data/path/data.csv',  # noqa: E501
                'version_id': '1111',
                'file_hash': 'lmnopqrst'
            }
        }
    }

    manifest_path = tmpdir / "manifest.json"
    with open(manifest_path, "w") as f:
        json.dump(manifest_1, f)

    # stand-in for CloudCacheBase._file_exists so that no file on disk
    # needs to be present (or hashed) for the path lookups below
    def dummy_file_exists(self, m):
        return True

    # we do not want paths to `/allen` to be resolved to
    # a local drive on the user's machine.
    # Raw strings keep the backslashes literal without relying on
    # invalid escape sequences, which newer Pythons warn about.
    bad_windows_pattern = re.compile(r'^[A-Z]\:')

    # make sure pattern is correctly formulated
    m = bad_windows_pattern.search(r'C:\a\windows\path')
    assert m is not None

    with monkeypatch.context() as ctx:
        # minimal concrete subclass: the abstract methods are stubbed
        # out because nothing is downloaded in this test
        class TestCloudCache(CloudCacheBase):
            def _download_file(self, m, o):
                pass

            def _download_manifest(self, m, o):
                pass

            def _list_all_manifests(self):
                pass

        ctx.setattr(TestCloudCache,
                    '_file_exists',
                    dummy_file_exists)

        cache = TestCloudCache(cache_dir, 'proj')
        cache._manifest = Manifest(cache_dir, json_input=manifest_path)

        # NOTE(review): the whole return value is stringified here. If
        # metadata_path()/data_path() return dicts (as CloudCacheBase in
        # this file appears to), str() starts with '{' and the pattern
        # can never match -- confirm whether ['local_path'] was intended.
        m_path = cache.metadata_path('a.csv')
        assert bad_windows_pattern.match(str(m_path)) is None
        d_path = cache.data_path('data_1')
        assert bad_windows_pattern.match(str(d_path)) is None
class CloudCacheBase(ABC):
    """
    A class to handle the downloading and accessing of
    data served from a cloud storage system

    Subclasses supply the storage-specific pieces by implementing
    _list_all_manifests, _download_manifest and _download_file.

    Parameters
    ----------
    cache_dir: str or pathlib.Path
        Path to the directory where data will be stored on the local system

    project_name: str
        the name of the project this cache is supposed to access. This will
        be the root directory for all files stored in the bucket.
    """

    _bucket_name = None

    def __init__(self, cache_dir, project_name):
        # the cache directory is created up front (no error if it
        # already exists)
        os.makedirs(cache_dir, exist_ok=True)

        # no manifest is active until load_manifest() is called
        self._manifest = None
        self._cache_dir = cache_dir
        self._project_name = project_name
        self._manifest_file_names = self._list_all_manifests()

    @abstractmethod
    def _list_all_manifests(self) -> list:
        """
        Return a list of all of the file names of the manifests associated
        with this dataset
        """
        raise NotImplementedError()

    @property
    def latest_manifest_file(self) -> str:
        """parses available manifest files for semver string
        and returns the latest one
        self.manifest_file_names are assumed to be of the form
        '<anything>_v<semver_str>.json'

        Returns
        -------
        str
            the filename whose semver string is the latest one
        """
        names = self.manifest_file_names
        # keep what sits between the last "_v" and the first ".json"
        vstrs = [name.split(".json")[0].split("_v")[-1] for name in names]
        versions = [semver.VersionInfo.parse(v) for v in vstrs]
        imax = versions.index(max(versions))
        return names[imax]

    def load_latest_manifest(self):
        """
        Load the manifest named by self.latest_manifest_file
        """
        self.load_manifest(self.latest_manifest_file)

    @abstractmethod
    def _download_manifest(self, manifest_name: str):
        """
        Download a manifest from the dataset

        Parameters
        ----------
        manifest_name: str
            The name of the manifest to load. Must be an element in
            self.manifest_file_names
        """
        raise NotImplementedError()

    @abstractmethod
    def _download_file(self, file_attributes: CacheFileAttributes) -> bool:
        """
        Check if a file exists and is in the expected state.

        If it is, return True.

        If it is not, download the file, creating the directory
        where the file is to be stored if necessary.

        If the download is successful, return True.

        If the download fails (file hash does not match expectation),
        return False.

        Parameters
        ----------
        file_attributes: CacheFileAttributes
            Describes the file to download

        Returns
        -------
        bool
            True if the file is present in the expected state once the
            method returns; False if the download failed

        Raises
        ------
        RuntimeError
            If the path to the directory where the file is to be saved
            points to something that is not a directory.

        RuntimeError
            If it is not able to successfully download the file after
            10 iterations
        """
        raise NotImplementedError()

    @property
    def project_name(self) -> str:
        """
        The name of the project that this cache is accessing
        """
        return self._project_name

    @property
    def manifest_prefix(self) -> str:
        """
        On-line prefix for manifest files
        """
        return f'{self.project_name}/manifests/'

    @property
    def file_id_column(self) -> str:
        """
        The column in the metadata files used to uniquely
        identify data files (read from the currently loaded manifest)
        """
        return self._manifest.file_id_column

    @property
    def version(self) -> str:
        """
        The version of the dataset currently loaded
        """
        return self._manifest.version

    @property
    def metadata_file_names(self) -> list:
        """
        List of metadata file names associated with this dataset
        (read from the currently loaded manifest)
        """
        return self._manifest.metadata_file_names

    @property
    def manifest_file_names(self) -> list:
        """
        List of manifest file names associated with this dataset.

        A deep copy of the list obtained from _list_all_manifests()
        at construction time is returned, so modifying the result
        does not affect the cache.
        NOTE(review): the order is whatever _list_all_manifests()
        produced; nothing in this class sorts it.
        """
        return copy.deepcopy(self._manifest_file_names)

    def load_manifest(self, manifest_name: str):
        """
        Load a manifest from this dataset, downloading it first if it
        is not already present in the cache directory. The loaded
        manifest is stored on self._manifest.

        Parameters
        ----------
        manifest_name: str
            The name of the manifest to load. Must be an element in
            self.manifest_file_names

        Raises
        ------
        ValueError
            If manifest_name is not in self.manifest_file_names
        """
        if manifest_name not in self.manifest_file_names:
            raise ValueError(f"manifest: {manifest_name}\n"
                             "is not one of the valid manifest names "
                             "for this dataset:\n"
                             f"{self.manifest_file_names}")

        filepath = os.path.join(self._cache_dir, manifest_name)
        if not os.path.exists(filepath):
            self._download_manifest(manifest_name)

        with open(filepath) as f:
            self._manifest = Manifest(cache_dir=self._cache_dir,
                                      json_input=f)

    def _file_exists(self, file_attributes: CacheFileAttributes) -> bool:
        """
        Given a CacheFileAttributes describing a file, assess whether or
        not that file exists locally and is valid (i.e. has the expected
        file hash)

        Parameters
        ----------
        file_attributes: CacheFileAttributes
            Description of the file to look for

        Returns
        -------
        bool
            True if the file exists and is valid; False otherwise

        Raises
        ------
        RuntimeError
            If file_attributes.local_path exists but is not a file.
            It would be unclear how the cache should proceed in this case.
        """
        local_path = file_attributes.local_path

        if not local_path.exists():
            return False
        if not local_path.is_file():
            raise RuntimeError(f"{file_attributes.local_path}\n"
                               "exists, but is not a file;\n"
                               "unsure how to proceed")

        # the file is only considered valid if its hash matches the
        # hash recorded in file_attributes
        test_checksum = file_hash_from_path(local_path.resolve())
        return test_checksum == file_attributes.file_hash

    def _describe_local_file(self,
                             file_attributes: CacheFileAttributes) -> dict:
        """
        Build the dict returned by data_path() and metadata_path()

        Parameters
        ----------
        file_attributes: CacheFileAttributes
            Description of the file being looked up

        Returns
        -------
        dict
            'local_path', 'exists' and 'file_attributes' entries, as
            documented on data_path() and metadata_path()
        """
        exists = self._file_exists(file_attributes)
        return {'local_path': file_attributes.local_path,
                'exists': exists,
                'file_attributes': file_attributes}

    def data_path(self, file_id) -> dict:
        """
        Return the local path to a data file, and test for the
        file's existence/validity. Nothing is downloaded.

        Parameters
        ----------
        file_id:
            The unique identifier of the file to be accessed

        Returns
        -------
        dict

            'local_path' will be a pathlib.Path pointing to the file's
            location

            'exists' will be a boolean indicating if the file
            exists in a valid state

            'file_attributes' is a CacheFileAttributes describing
            the file in more detail

        Raises
        ------
        RuntimeError
            If the local path exists but is not a file
            (see _file_exists)
        """
        file_attributes = self._manifest.data_file_attributes(file_id)
        return self._describe_local_file(file_attributes)

    def download_data(self, file_id) -> pathlib.Path:
        """
        Return the local path to a data file, downloading the file
        if necessary

        Parameters
        ----------
        file_id:
            The unique identifier of the file to be accessed

        Returns
        -------
        pathlib.Path
            The path indicating where the file is stored on the
            local system

        Raises
        ------
        RuntimeError
            If the file cannot be downloaded
        """
        file_attributes = self.data_path(file_id)['file_attributes']
        # the boolean returned by _download_file is not inspected here;
        # per its docstring it raises RuntimeError after repeated failures
        self._download_file(file_attributes)
        return file_attributes.local_path

    def metadata_path(self, fname: str) -> dict:
        """
        Return the local path to a metadata file, and test for the
        file's existence/validity. Nothing is downloaded.

        Parameters
        ----------
        fname: str
            The name of the metadata file to be accessed

        Returns
        -------
        dict

            'local_path' will be a pathlib.Path pointing to the file's
            location

            'exists' will be a boolean indicating if the file
            exists in a valid state

            'file_attributes' is a CacheFileAttributes describing
            the file in more detail

        Raises
        ------
        RuntimeError
            If the local path exists but is not a file
            (see _file_exists)
        """
        file_attributes = self._manifest.metadata_file_attributes(fname)
        return self._describe_local_file(file_attributes)

    def download_metadata(self, fname: str) -> pathlib.Path:
        """
        Return the local path to a metadata file, downloading the
        file if necessary

        Parameters
        ----------
        fname: str
            The name of the metadata file to be accessed

        Returns
        -------
        pathlib.Path
            The path indicating where the file is stored on the
            local system

        Raises
        ------
        RuntimeError
            If the file cannot be downloaded
        """
        file_attributes = self.metadata_path(fname)['file_attributes']
        # the boolean returned by _download_file is not inspected here;
        # per its docstring it raises RuntimeError after repeated failures
        self._download_file(file_attributes)
        return file_attributes.local_path

    def get_metadata(self, fname: str) -> pd.DataFrame:
        """
        Return a pandas DataFrame of metadata

        Parameters
        ----------
        fname: str
            The name of the metadata file to load

        Returns
        -------
        pd.DataFrame
            The result of reading the metadata file with pd.read_csv

        Notes
        -----
        This method will check to see if the specified metadata file exists
        locally. If it does not, the method will download the file. Use
        self.metadata_path() to find where the file is stored
        """
        local_path = self.download_metadata(fname)
        return pd.read_csv(local_path)