def remove(self, ref):
    """Indicate to the Datastore that a Dataset can be removed.

    .. warning::

        This method does not support transactions; removals are
        immediate, cannot be undone, and are not guaranteed to be
        atomic if deleting either the file or the internal database
        records fails.

    Parameters
    ----------
    ref : `DatasetRef`
        Reference to the required Dataset.

    Raises
    ------
    FileNotFoundError
        Attempt to remove a dataset that does not exist.
    """
    location, storedFileInfo = self._get_dataset_location_info(ref)
    if location is None:
        raise FileNotFoundError(f"Requested dataset ({ref}) does not exist")

    # s3CheckFileExists returns an (exists, size) tuple; the original code
    # tested the tuple itself, which is always truthy, so this error could
    # never be raised. Index the boolean explicitly.
    if not s3CheckFileExists(location, client=self.client)[0]:
        raise FileNotFoundError(f"No such file: {location.uri}")

    # https://github.com/boto/boto3/issues/507 - there is no way of knowing
    # if the file was actually deleted
    self.client.delete_object(Bucket=location.netloc,
                              Key=location.relativeToPathRoot)

    # Remove rows from registries
    self._remove_from_registry(ref)
def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
    # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
    if transfer not in (None, "move", "copy"):
        raise NotImplementedError(f"Transfer mode {transfer} not supported.")

    # Ingest can go file->s3 or s3->s3; the target is always S3, and the
    # source must already exist. A schemeless URI is treated as a local
    # path obeying os.path rules -- the equivalent of the
    # os.path.exists(fullPath) check in PosixDatastore.
    srcUri = ButlerURI(path)
    scheme = srcUri.scheme
    if scheme == 's3':
        if not s3CheckFileExists(srcUri, client=self.client)[0]:
            raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
    elif scheme == 'file' or not scheme:
        if not os.path.exists(srcUri.ospath):
            raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
    else:
        raise NotImplementedError(f"Scheme type {scheme} not supported.")

    # With no transfer requested the file must already live inside the
    # repository root (and therefore must already be in S3).
    if transfer is None:
        rootUri = ButlerURI(self.root)
        if scheme == "file":
            raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'. "
                               "Ingesting local data to S3Datastore without upload "
                               "to S3 is not allowed.")
        if scheme == "s3" and not srcUri.path.startswith(rootUri.path):
            raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'.")

    return path
def put(self, inMemoryDataset, ref):
    """Write a InMemoryDataset with a given `DatasetRef` to the store.

    Parameters
    ----------
    inMemoryDataset : `object`
        The Dataset to store.
    ref : `DatasetRef`
        Reference to the associated Dataset.

    Raises
    ------
    TypeError
        Supplied object and storage class are inconsistent.
    DatasetTypeNotSupportedError
        The associated `DatasetType` is not handled by this datastore.

    Notes
    -----
    If the datastore is configured to reject certain dataset types it
    is possible that the put will fail and raise a
    `DatasetTypeNotSupportedError`.  The main use case for this is to
    allow `ChainedDatastore` to put to multiple datastores without
    requiring that every datastore accepts the dataset.
    """
    location, formatter = self._prepare_for_put(inMemoryDataset, ref)
    location.updateExtension(formatter.extension)

    # Unlike PosixDatastore there is no directory to create with
    # `safeMakeDir`: S3 keys merely look like paths. Instead we refuse
    # to overwrite an exact key that already exists; inserting the key
    # is the moral equivalent of creating the dir plus the file.
    if s3CheckFileExists(location, client=self.client)[0]:
        raise FileExistsError(f"Cannot write file for ref {ref} as "
                              f"output file {location.uri} exists.")

    # Prefer a direct in-memory upload; fall back to a local temporary
    # file when the formatter does not implement toBytes.
    try:
        payload = formatter.toBytes(inMemoryDataset)
        self.client.put_object(Bucket=location.netloc,
                               Key=location.relativeToPathRoot,
                               Body=payload)
        log.debug("Wrote file directly to %s", location.uri)
    except NotImplementedError:
        with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmp:
            formatter._fileDescriptor.location = Location(*os.path.split(tmp.name))
            formatter.write(inMemoryDataset)
            self.client.upload_file(Bucket=location.netloc,
                                    Key=location.relativeToPathRoot,
                                    Filename=tmp.name)
            log.debug("Wrote file to %s via a temporary directory.",
                      location.uri)

    # Register a callback to try to delete the uploaded data if
    # the ingest fails below
    self._transaction.registerUndo("write", self.client.delete_object,
                                   Bucket=location.netloc,
                                   Key=location.relativeToPathRoot)

    # URI is needed to resolve what ingest case are we dealing with
    info = self._extractIngestInfo(location.uri, ref, formatter=formatter)
    self._register_datasets([(ref, info)])
def testFileExists(self):
    """Exercise s3CheckFileExists with every supported path flavour
    (bucket+key strings, ButlerURI, Location, and a plain URI string).
    """
    self.assertTrue(
        s3CheckFileExists(client=self.client, bucket=self.bucketName,
                          path=self.fileName)[0])
    self.assertFalse(
        s3CheckFileExists(client=self.client, bucket=self.bucketName,
                          path=self.fileName + "_NO_EXIST")[0])

    datastoreRootUri = f"s3://{self.bucketName}/"
    uri = f"s3://{self.bucketName}/{self.fileName}"

    buri = ButlerURI(uri)
    location = Location(datastoreRootUri, self.fileName)

    self.assertTrue(s3CheckFileExists(client=self.client, path=buri)[0])
    # just to make sure the overloaded keyword works correctly
    self.assertTrue(s3CheckFileExists(buri, client=self.client)[0])
    self.assertTrue(
        s3CheckFileExists(client=self.client, path=location)[0])

    # make sure supplying strings resolves correctly too
    # The [0] index is required: s3CheckFileExists returns an
    # (exists, size) tuple, which is truthy even when exists is False,
    # so asserting on the bare return value never fails.
    self.assertTrue(s3CheckFileExists(uri, client=self.client)[0])
    self.assertTrue(s3CheckFileExists(uri)[0])
def checkFileExists(self, root, relpath):
    """Check whether a file exists at the given path, relative to root.

    Test ``testPutTemplates`` verifies actual physical existence of the
    files in the requested location. For S3Datastore this check is
    equivalent to a `lsst.daf.butler.core.s3utils.s3CheckFileExists`
    call.
    """
    # NOTE(review): relpath is unused -- root appears to be expected to
    # already hold the complete URI; confirm against the callers.
    client = boto3.client("s3")
    return s3CheckFileExists(ButlerURI(root), client=client)[0]
def _extractIngestInfo(self, path: str, ref: DatasetRef, *,
                       formatter: Union[Formatter, Type[Formatter]],
                       transfer: Optional[str] = None) -> StoredFileInfo:
    # Docstring inherited from FileLikeDatastore._extractIngestInfo.
    srcUri = ButlerURI(path)
    if transfer is None:
        # The file is already inside the datastore root; just work out
        # its path relative to that root.
        rootUri = ButlerURI(self.root)
        p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
        pathInStore = str(p.relative_to(rootUri.relativeToPathRoot))
        tgtLocation = self.locationFactory.fromPath(pathInStore)
    else:
        assert transfer == "move" or transfer == "copy", "Should be guaranteed by _standardizeIngestPath"
        # Work out the name we want this ingested file to have
        # inside the datastore
        tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)

        if srcUri.scheme == "file":
            # source is on local disk.
            with open(srcUri.ospath, 'rb') as f:
                self.client.put_object(Bucket=tgtLocation.netloc,
                                       Key=tgtLocation.relativeToPathRoot,
                                       Body=f)
            if transfer == "move":
                os.remove(srcUri.ospath)
        elif srcUri.scheme == "s3":
            # source is another S3 Bucket
            relpath = srcUri.relativeToPathRoot
            copySrc = {"Bucket": srcUri.netloc, "Key": relpath}
            self.client.copy(copySrc, self.locationFactory.netloc,
                             tgtLocation.relativeToPathRoot)
            if transfer == "move":
                # https://github.com/boto/boto3/issues/507 - there is no
                # way of knowing if the file was actually deleted except
                # for checking all the keys again, response is HTTP 204 OK
                # response all the time
                # Fixed: boto3 S3 clients have no ``delete`` method; the
                # original ``self.client.delete(...)`` raised
                # AttributeError. The correct call is ``delete_object``.
                self.client.delete_object(Bucket=srcUri.netloc, Key=relpath)

    # the file should exist on the bucket by now
    _, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                bucket=tgtLocation.netloc,
                                client=self.client)

    return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
                          storageClass=ref.datasetType.storageClass,
                          component=ref.datasetType.component(),
                          file_size=size, checksum=None)
def _write_in_memory_to_artifact(self, inMemoryDataset: Any,
                                 ref: DatasetRef) -> StoredFileInfo:
    """Serialize an in-memory dataset and upload it as an S3 object.

    Parameters
    ----------
    inMemoryDataset : `object`
        The dataset to store.
    ref : `DatasetRef`
        Reference to the associated dataset.

    Returns
    -------
    info : `StoredFileInfo`
        Metadata describing the stored artifact.

    Raises
    ------
    FileExistsError
        An object already exists at the computed key.
    RuntimeError
        No transaction is active for registering the undo callback.
    """
    location, formatter = self._prepare_for_put(inMemoryDataset, ref)

    # S3 "directories" are only key prefixes, so the safeMakeDir step of
    # PosixDatastore is replaced here by an exact-key existence check.
    exists, _ = s3CheckFileExists(location, client=self.client)
    if exists:
        raise FileExistsError(f"Cannot write file for ref {ref} as "
                              f"output file {location.uri} exists.")

    try:
        # Fast path: formatter can serialize straight to bytes.
        body = formatter.toBytes(inMemoryDataset)
        log.debug("Writing file directly to %s", location.uri)
        self.client.put_object(Bucket=location.netloc,
                               Key=location.relativeToPathRoot,
                               Body=body)
        log.debug("Successfully wrote file directly to %s", location.uri)
    except NotImplementedError:
        # Slow path: write through a local temporary file, then stream
        # that file up to the bucket.
        with tempfile.NamedTemporaryFile(suffix=location.getExtension()) as tmp:
            formatter._fileDescriptor.location = Location(*os.path.split(tmp.name))
            formatter.write(inMemoryDataset)
            with open(tmp.name, 'rb') as stream:
                log.debug("Writing file to %s via a temporary directory.",
                          location.uri)
                self.client.put_object(Bucket=location.netloc,
                                       Key=location.relativeToPathRoot,
                                       Body=stream)
            log.debug("Successfully wrote file to %s via a temporary directory.",
                      location.uri)

    if self._transaction is None:
        raise RuntimeError("Attempting to write artifact without transaction enabled")

    # Register a callback to try to delete the uploaded data if
    # the ingest fails below
    self._transaction.registerUndo("write", self.client.delete_object,
                                   Bucket=location.netloc,
                                   Key=location.relativeToPathRoot)

    # URI is needed to resolve what ingest case are we dealing with
    return self._extractIngestInfo(location.uri, ref, formatter=formatter)
def exists(self, ref):
    """Check if the dataset exists in the datastore.

    Parameters
    ----------
    ref : `DatasetRef`
        Reference to the required dataset.

    Returns
    -------
    exists : `bool`
        `True` if the entity exists in the `Datastore`.
    """
    # Short-circuit: a missing registry entry means the dataset cannot
    # exist, so the S3 lookup is skipped entirely.
    storedLocation, _ = self._get_dataset_location_info(ref)
    return (storedLocation is not None
            and s3CheckFileExists(storedLocation, client=self.client)[0])
def _artifact_exists(self, location: Location) -> bool:
    """Report whether an artifact is present at the given location.

    Parameters
    ----------
    location : `Location`
        Expected location of the artifact associated with this
        datastore.

    Returns
    -------
    exists : `bool`
        True if the location can be found, false otherwise.
    """
    log.debug("Checking if file exists: %s", location.uri)
    # s3CheckFileExists returns (exists, size); only the flag matters.
    return s3CheckFileExists(location, client=self.client)[0]
def _extractIngestInfo(self, path: str, ref: DatasetRef, *,
                       formatter: Type[Formatter],
                       transfer: Optional[str] = None) -> StoredFileInfo:
    # Docstring inherited from FileLikeDatastore._extractIngestInfo.
    srcUri = ButlerURI(path)
    # Fixed: rootUri was previously assigned only inside the
    # ``transfer is None`` branch but is also read in the s3 move/copy
    # branch below, which raised NameError; hoist it so every branch
    # can use it.
    rootUri = ButlerURI(self.root)
    if transfer is None:
        # The file is already inside the datastore root; compute its
        # path relative to that root.
        p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
        pathInStore = str(p.relative_to(rootUri.relativeToPathRoot))
        tgtLocation = self.locationFactory.fromPath(pathInStore)
    else:
        assert transfer == "move" or transfer == "copy", "Should be guaranteed by _standardizeIngestPath"
        if srcUri.scheme == "file":
            # source is on local disk; upload it under the templated
            # datastore name.
            template = self.templates.getTemplate(ref)
            location = self.locationFactory.fromPath(template.format(ref))
            tgtPathInStore = formatter.predictPathFromLocation(location)
            tgtLocation = self.locationFactory.fromPath(tgtPathInStore)
            self.client.upload_file(Bucket=tgtLocation.netloc,
                                    Key=tgtLocation.relativeToPathRoot,
                                    Filename=srcUri.ospath)
            if transfer == "move":
                os.remove(srcUri.ospath)
        elif srcUri.scheme == "s3":
            # source is another S3 Bucket
            relpath = srcUri.relativeToPathRoot
            copySrc = {"Bucket": srcUri.netloc, "Key": relpath}
            self.client.copy(copySrc, self.locationFactory.netloc, relpath)
            if transfer == "move":
                # https://github.com/boto/boto3/issues/507 - there is no
                # way of knowing if the file was actually deleted except
                # for checking all the keys again, response is HTTP 204 OK
                # response all the time
                # Fixed: boto3 S3 clients have no ``delete`` method; use
                # ``delete_object`` (as the rest of the file does).
                self.client.delete_object(Bucket=srcUri.netloc, Key=relpath)
            p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
            relativeToDatastoreRoot = str(p.relative_to(rootUri.relativeToPathRoot))
            tgtLocation = self.locationFactory.fromPath(relativeToDatastoreRoot)

    # the file should exist on the bucket by now
    exists, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                     bucket=tgtLocation.netloc,
                                     client=self.client)

    return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
                          storageClass=ref.datasetType.storageClass,
                          file_size=size, checksum=None)
def ingest(self, path, ref, formatter=None, transfer=None):
    """Add an on-disk file with the given `DatasetRef` to the store,
    possibly transferring it.

    The caller is responsible for ensuring that the given (or predicted)
    Formatter is consistent with how the file was written; `ingest` will
    in general silently ignore incorrect formatters (as it cannot
    efficiently verify their correctness), deferring errors until ``get``
    is first called on the ingested dataset.

    Parameters
    ----------
    path : `str`
        File path.  Treated as relative to the repository root if not
        absolute.
    ref : `DatasetRef`
        Reference to the associated Dataset.
    formatter : `Formatter` (optional)
        Formatter that should be used to retrieve the Dataset.  If not
        provided, the formatter will be constructed according to
        Datastore configuration.
    transfer : str (optional)
        If not None, must be one of 'move' or 'copy' indicating how to
        transfer the file.  The new filename and location will be
        determined via template substitution, as with ``put``.  If the
        file is outside the datastore root, it must be transferred
        somehow.

    Raises
    ------
    RuntimeError
        Raised if ``transfer is None`` and path is outside the
        repository root.
    FileNotFoundError
        Raised if the file at ``path`` does not exist.
    FileExistsError
        Raised if ``transfer is not None`` but a file already exists at
        the location computed from the template.
    PermissionError
        Raised when check if file exists at target location in S3 can
        not be made because IAM user used lacks s3:GetObject or
        s3:ListBucket permissions.
    """
    if not self.constraints.isAcceptable(ref):
        # Raise rather than use boolean return value.
        raise DatasetTypeNotSupportedError(
            f"Dataset {ref} has been rejected by this datastore via"
            " configuration.")

    if formatter is None:
        formatter = self.formatterFactory.getFormatterClass(ref)

    # ingest can occur from file->s3 and s3->s3 (source can be file or
    # s3, target will always be s3). File has to exist at target
    # location. Two schemeless URIs are assumed to obey os.path rules.
    # Equivalent to os.path.exists(fullPath) check in PosixDatastore.
    srcUri = ButlerURI(path)
    if srcUri.scheme == 'file' or not srcUri.scheme:
        if not os.path.exists(srcUri.ospath):
            raise FileNotFoundError(
                f"File at '{srcUri}' does not exist; note that paths to ingest are "
                "assumed to be relative to self.root unless they are absolute."
            )
    elif srcUri.scheme == 's3':
        if not s3CheckFileExists(srcUri, client=self.client)[0]:
            raise FileNotFoundError(
                "File at '{}' does not exist; note that paths to ingest are "
                "assumed to be relative to self.root unless they are absolute."
                .format(srcUri))
    else:
        raise NotImplementedError(
            f"Scheme type {srcUri.scheme} not supported.")

    # Fixed: rootUri is needed both in the ``transfer is None`` branch
    # and in the s3 move/copy branch; previously it was assigned only in
    # the former, so the latter raised NameError. Hoist the assignment.
    rootUri = ButlerURI(self.root)

    # Transfer is generally None when put calls ingest. In that case
    # file is uploaded in put, or already in proper location, so source
    # location must be inside repository. In other cases, created target
    # location must be inside root and source file must be deleted when
    # 'move'd.
    if transfer is None:
        if srcUri.scheme == "file":
            raise RuntimeError(
                f"'{srcUri}' is not inside repository root '{rootUri}'. "
                "Ingesting local data to S3Datastore without upload "
                "to S3 is not allowed.")
        elif srcUri.scheme == "s3":
            if not srcUri.path.startswith(rootUri.path):
                raise RuntimeError(
                    f"'{srcUri}' is not inside repository root '{rootUri}'."
                )
        p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
        pathInStore = str(p.relative_to(rootUri.relativeToPathRoot))
        tgtLocation = self.locationFactory.fromPath(pathInStore)
    elif transfer == "move" or transfer == "copy":
        if srcUri.scheme == "file":
            # source is on local disk; upload it under the templated
            # datastore name.
            template = self.templates.getTemplate(ref)
            location = self.locationFactory.fromPath(template.format(ref))
            tgtPathInStore = formatter.predictPathFromLocation(location)
            tgtLocation = self.locationFactory.fromPath(tgtPathInStore)
            self.client.upload_file(Bucket=tgtLocation.netloc,
                                    Key=tgtLocation.relativeToPathRoot,
                                    Filename=srcUri.ospath)
            if transfer == "move":
                os.remove(srcUri.ospath)
        elif srcUri.scheme == "s3":
            # source is another S3 Bucket
            relpath = srcUri.relativeToPathRoot
            copySrc = {"Bucket": srcUri.netloc, "Key": relpath}
            self.client.copy(copySrc, self.locationFactory.netloc, relpath)
            if transfer == "move":
                # https://github.com/boto/boto3/issues/507 - there is no
                # way of knowing if the file was actually deleted except
                # for checking all the keys again, response is HTTP 204
                # OK response all the time
                # Fixed: boto3 S3 clients have no ``delete`` method; use
                # ``delete_object`` (as ``remove`` does).
                self.client.delete_object(Bucket=srcUri.netloc, Key=relpath)
            p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
            relativeToDatastoreRoot = str(
                p.relative_to(rootUri.relativeToPathRoot))
            tgtLocation = self.locationFactory.fromPath(
                relativeToDatastoreRoot)
    else:
        raise NotImplementedError(
            f"Transfer type '{transfer}' not supported.")

    # the file should exist on the bucket by now
    exists, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                     bucket=tgtLocation.netloc,
                                     client=self.client)

    # Update the registry
    self._register_dataset_file(ref, formatter, tgtLocation.pathInStore,
                                size, None)