Example #1
    def remove(self, ref):
        """Indicate to the Datastore that a Dataset can be removed.

        .. warning::

            This method does not support transactions; removals are
            immediate, cannot be undone, and are not guaranteed to
            be atomic if deleting either the file or the internal
            database records fails.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist.
        """
        location, storedFileInfo = self._get_dataset_location_info(ref)
        if location is None:
            raise FileNotFoundError(
                f"Requested dataset ({ref}) does not exist")

        if not s3CheckFileExists(location, client=self.client):
            raise FileNotFoundError(f"No such file: {location.uri}")

        # https://github.com/boto/boto3/issues/507 - there is no way of knowing
        # if the file was actually deleted
        self.client.delete_object(Bucket=location.netloc,
                                  Key=location.relativeToPathRoot)

        # Remove rows from registries
        self._remove_from_registry(ref)
    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        if transfer not in (None, "move", "copy"):
            raise NotImplementedError(f"Transfer mode {transfer} not supported.")
        # Ingest can occur from file->s3 and s3->s3 (the source can be file
        # or s3, the target will always be s3). The file has to exist at the
        # source location. Schemeless URIs are assumed to obey os.path rules.
        # Equivalent to the os.path.exists(fullPath) check in PosixDatastore.
        srcUri = ButlerURI(path)
        if srcUri.scheme == 'file' or not srcUri.scheme:
            if not os.path.exists(srcUri.ospath):
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        elif srcUri.scheme == 's3':
            if not s3CheckFileExists(srcUri, client=self.client)[0]:
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        else:
            raise NotImplementedError(f"Scheme type {srcUri.scheme} not supported.")

        if transfer is None:
            rootUri = ButlerURI(self.root)
            if srcUri.scheme == "file":
                raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'. "
                                   "Ingesting local data to S3Datastore without upload "
                                   "to S3 is not allowed.")
            elif srcUri.scheme == "s3":
                if not srcUri.path.startswith(rootUri.path):
                    raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'.")
        return path
    def put(self, inMemoryDataset, ref):
        """Write a InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The Dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`.  The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
        """
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        # In PosixDatastore a directory can be created by `safeMakeDir`. In
        # S3, `Keys` only look like directories but are not. Instead, we
        # check whether an *exact* full key already exists before writing;
        # the key insert operation is equivalent to creating the dir and the
        # file.
        location.updateExtension(formatter.extension)
        if s3CheckFileExists(location, client=self.client)[0]:
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {location.uri} exists.")

        # upload the file directly from bytes or by using a temporary file if
        # _toBytes is not implemented
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            self.client.put_object(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                   Body=serializedDataset)
            log.debug("Wrote file directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile:
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                formatter.write(inMemoryDataset)
                self.client.upload_file(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                        Filename=tmpFile.name)
                log.debug("Wrote file to %s via a temporary directory.", location.uri)

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below
        self._transaction.registerUndo("write", self.client.delete_object,
                                       Bucket=location.netloc, Key=location.relativeToPathRoot)

        # The URI is needed to resolve which ingest case we are dealing with
        info = self._extractIngestInfo(location.uri, ref, formatter=formatter)
        self._register_datasets([(ref, info)])
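
A minimal standalone sketch of the boto3 issue #507 caveat noted in the comments above: delete_object responds with HTTP 204 whether or not the key existed, so a deletion can only be confirmed by checking for the key again. The client, bucket, and key below are hypothetical.

import boto3
from botocore.exceptions import ClientError


def deleteAndVerify(client, bucket: str, key: str) -> bool:
    """Delete ``key`` and report whether it is gone afterwards."""
    client.delete_object(Bucket=bucket, Key=key)  # always responds 204
    try:
        client.head_object(Bucket=bucket, Key=key)
    except ClientError as err:
        if err.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
            return True  # the key no longer exists
        raise
    return False  # the key is still present


# usage, with a made-up bucket and key:
# deleteAndVerify(boto3.client("s3"), "my-bucket", "repo/root/file.dat")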
Example #4
    def testFileExists(self):
        self.assertTrue(
            s3CheckFileExists(client=self.client,
                              bucket=self.bucketName,
                              path=self.fileName)[0])
        self.assertFalse(
            s3CheckFileExists(client=self.client,
                              bucket=self.bucketName,
                              path=self.fileName + "_NO_EXIST")[0])

        datastoreRootUri = f"s3://{self.bucketName}/"
        uri = f"s3://{self.bucketName}/{self.fileName}"

        buri = ButlerURI(uri)
        location = Location(datastoreRootUri, self.fileName)

        self.assertTrue(s3CheckFileExists(client=self.client, path=buri)[0])
        # just to make sure the overloaded keyword works correctly
        self.assertTrue(s3CheckFileExists(buri, client=self.client)[0])
        self.assertTrue(
            s3CheckFileExists(client=self.client, path=location)[0])

        # make sure supplying strings resolves correctly too
        self.assertTrue(s3CheckFileExists(uri, client=self.client)[0])
        self.assertTrue(s3CheckFileExists(uri)[0])
    def checkFileExists(self, root, relpath):
        """Checks if file exists at a given path (relative to root).

        Test testPutTemplates verifies actual physical existance of the files
        in the requested location. For S3Datastore this test is equivalent to
        `lsst.daf.butler.core.s3utils.s3checkFileExists` call.
        """
        uri = ButlerURI(os.path.join(root, relpath))
        client = boto3.client("s3")
        return s3CheckFileExists(uri, client=client)[0]
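
The test fixtures are not shown here; below is a hedged sketch of the kind of setup these assertions assume, using the moto library to mock S3 in memory (mock_s3 in moto 4.x; moto 5 renamed the decorator to mock_aws). Bucket and file names are illustrative.

import unittest

import boto3
from moto import mock_s3


@mock_s3
class S3ExistsTestCase(unittest.TestCase):
    """Illustrative stand-in for the fixture the tests above assume."""

    bucketName = "test-bucket"
    fileName = "test/file.txt"

    def setUp(self):
        self.client = boto3.client("s3", region_name="us-east-1")
        self.client.create_bucket(Bucket=self.bucketName)
        self.client.put_object(Bucket=self.bucketName, Key=self.fileName,
                               Body=b"data")

    def testHeadObject(self):
        # the bare-bones equivalent of s3CheckFileExists for a bucket/key
        response = self.client.head_object(Bucket=self.bucketName,
                                           Key=self.fileName)
        self.assertEqual(response["ContentLength"], 4)


if __name__ == "__main__":
    unittest.main()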
Example #6
    def _extractIngestInfo(self,
                           path: str,
                           ref: DatasetRef,
                           *,
                           formatter: Union[Formatter, Type[Formatter]],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        srcUri = ButlerURI(path)
        if transfer is None:
            rootUri = ButlerURI(self.root)
            p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
            pathInStore = str(p.relative_to(rootUri.relativeToPathRoot))
            tgtLocation = self.locationFactory.fromPath(pathInStore)
        else:
            assert transfer == "move" or transfer == "copy", "Should be guaranteed by _standardizeIngestPath"

            # Work out the name we want this ingested file to have
            # inside the datastore
            tgtLocation = self._calculate_ingested_datastore_name(
                srcUri, ref, formatter)

            if srcUri.scheme == "file":
                # source is on local disk.
                with open(srcUri.ospath, 'rb') as f:
                    self.client.put_object(Bucket=tgtLocation.netloc,
                                           Key=tgtLocation.relativeToPathRoot,
                                           Body=f)
                if transfer == "move":
                    os.remove(srcUri.ospath)
            elif srcUri.scheme == "s3":
                # source is another S3 Bucket
                relpath = srcUri.relativeToPathRoot
                copySrc = {"Bucket": srcUri.netloc, "Key": relpath}
                self.client.copy(copySrc, self.locationFactory.netloc,
                                 tgtLocation.relativeToPathRoot)
                if transfer == "move":
                    # https://github.com/boto/boto3/issues/507 - there is no
                    # way of knowing if the file was actually deleted except
                    # for checking all the keys again; the response is always
                    # HTTP 204, whether or not the key existed
                    self.client.delete_object(Bucket=srcUri.netloc,
                                              Key=relpath)

        # the file should exist on the bucket by now
        _, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                    bucket=tgtLocation.netloc,
                                    client=self.client)

        return StoredFileInfo(formatter=formatter,
                              path=tgtLocation.pathInStore,
                              storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size,
                              checksum=None)
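
For reference, the two upload paths used in the ingest code above, reduced to standalone form: put_object streams an open file in a single request, while upload_file is boto3's managed transfer and switches to multipart uploads for large files. Function names and paths here are made up.

import boto3


def uploadViaPutObject(client, localPath: str, bucket: str, key: str) -> None:
    # single PUT request; the whole body goes up in one call
    with open(localPath, "rb") as f:
        client.put_object(Bucket=bucket, Key=key, Body=f)


def uploadViaManagedTransfer(client, localPath: str, bucket: str,
                             key: str) -> None:
    # managed transfer; boto3 handles retries and multipart chunking
    client.upload_file(Filename=localPath, Bucket=bucket, Key=key)


# client = boto3.client("s3")
# uploadViaPutObject(client, "/tmp/flat.fits", "my-bucket", "repo/flat.fits")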
Example #7
    def _write_in_memory_to_artifact(self, inMemoryDataset: Any,
                                     ref: DatasetRef) -> StoredFileInfo:
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        # In PosixDatastore a directory can be created by `safeMakeDir`. In
        # S3, `Keys` only look like directories but are not. Instead, we
        # check whether an *exact* full key already exists before writing;
        # the key insert operation is equivalent to creating the dir and the
        # file.
        if s3CheckFileExists(location, client=self.client)[0]:
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {location.uri} exists.")

        # upload the file directly from bytes or by using a temporary file if
        # _toBytes is not implemented
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            log.debug("Writing file directly to %s", location.uri)
            self.client.put_object(Bucket=location.netloc,
                                   Key=location.relativeToPathRoot,
                                   Body=serializedDataset)
            log.debug("Successfully wrote file directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(
                    suffix=location.getExtension()) as tmpFile:
                formatter._fileDescriptor.location = Location(
                    *os.path.split(tmpFile.name))
                formatter.write(inMemoryDataset)
                with open(tmpFile.name, 'rb') as f:
                    log.debug("Writing file to %s via a temporary directory.",
                              location.uri)
                    self.client.put_object(Bucket=location.netloc,
                                           Key=location.relativeToPathRoot,
                                           Body=f)
                log.debug(
                    "Successfully wrote file to %s via a temporary file.",
                    location.uri)

        if self._transaction is None:
            raise RuntimeError(
                "Attempting to write artifact without transaction enabled")

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below
        self._transaction.registerUndo("write",
                                       self.client.delete_object,
                                       Bucket=location.netloc,
                                       Key=location.relativeToPathRoot)

        # The URI is needed to resolve which ingest case we are dealing with
        return self._extractIngestInfo(location.uri, ref, formatter=formatter)
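
The registerUndo call above assumes a transaction object that replays registered callbacks if a later step fails; here is a toy sketch of those assumed semantics (ToyTransaction is not the butler's real transaction class).

class ToyTransaction:
    """Stand-in with the ``registerUndo`` semantics assumed above."""

    def __init__(self):
        self._undo = []

    def registerUndo(self, name, func, **kwargs):
        # remember how to reverse an action that has just succeeded
        self._undo.append((name, func, kwargs))

    def rollback(self):
        # undo in reverse order of registration
        for _name, func, kwargs in reversed(self._undo):
            func(**kwargs)


# after a successful upload, the matching delete is registered so a later
# failure can remove the orphaned object:
# txn.registerUndo("write", client.delete_object, Bucket=bucket, Key=key)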
Example #8
    def exists(self, ref):
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        location, _ = self._get_dataset_location_info(ref)
        if location is None:
            return False
        return s3CheckFileExists(location, client=self.client)[0]
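
For context, a rough approximation of what s3CheckFileExists does for a plain bucket/key pair, returning the (exists, size) tuple indexed throughout these examples; the real helper also accepts Location and ButlerURI objects, which this sketch does not.

from botocore.exceptions import ClientError


def checkKeyExists(client, bucket: str, key: str):
    """Return ``(exists, size)`` for a bucket/key pair via a HEAD request."""
    try:
        response = client.head_object(Bucket=bucket, Key=key)
    except ClientError as err:
        if err.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
            return False, -1
        raise
    return True, response["ContentLength"]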
Example #9
    def _artifact_exists(self, location: Location) -> bool:
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        log.debug("Checking if file exists: %s", location.uri)
        exists, _ = s3CheckFileExists(location, client=self.client)
        return exists
Example #10
    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        srcUri = ButlerURI(path)
        rootUri = ButlerURI(self.root)
        if transfer is None:
            p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
            pathInStore = str(p.relative_to(rootUri.relativeToPathRoot))
            tgtLocation = self.locationFactory.fromPath(pathInStore)
        else:
            assert transfer == "move" or transfer == "copy", "Should be guaranteed by _standardizeIngestPath"
            if srcUri.scheme == "file":
                # source is on local disk.
                template = self.templates.getTemplate(ref)
                location = self.locationFactory.fromPath(template.format(ref))
                tgtPathInStore = formatter.predictPathFromLocation(location)
                tgtLocation = self.locationFactory.fromPath(tgtPathInStore)
                self.client.upload_file(Bucket=tgtLocation.netloc, Key=tgtLocation.relativeToPathRoot,
                                        Filename=srcUri.ospath)
                if transfer == "move":
                    os.remove(srcUri.ospath)
            elif srcUri.scheme == "s3":
                # source is another S3 Bucket
                relpath = srcUri.relativeToPathRoot
                copySrc = {"Bucket": srcUri.netloc, "Key": relpath}
                self.client.copy(copySrc, self.locationFactory.netloc, relpath)
                if transfer == "move":
                    # https://github.com/boto/boto3/issues/507 - there is no
                    # way of knowing if the file was actually deleted except
                    # for checking all the keys again; the response is always
                    # HTTP 204, whether or not the key existed
                    self.client.delete_object(Bucket=srcUri.netloc,
                                              Key=relpath)
                p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
                relativeToDatastoreRoot = str(p.relative_to(rootUri.relativeToPathRoot))
                tgtLocation = self.locationFactory.fromPath(relativeToDatastoreRoot)

        # the file should exist on the bucket by now
        exists, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                         bucket=tgtLocation.netloc,
                                         client=self.client)

        return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
                              storageClass=ref.datasetType.storageClass,
                              file_size=size, checksum=None)
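
A tiny standalone illustration of the path arithmetic in the transfer-is-None branch above: both URIs are reduced to bucket-relative keys, and the path in the store is the source key taken relative to the datastore root key. The key values are made up.

import pathlib

rootKey = "repo/root"                       # rootUri.relativeToPathRoot
srcKey = "repo/root/calib/flat/flat.fits"   # srcUri.relativeToPathRoot

pathInStore = str(pathlib.PurePosixPath(srcKey).relative_to(rootKey))
assert pathInStore == "calib/flat/flat.fits"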
Example #11
    def ingest(self, path, ref, formatter=None, transfer=None):
        """Add an on-disk file with the given `DatasetRef` to the store,
        possibly transferring it.

        The caller is responsible for ensuring that the given (or predicted)
        Formatter is consistent with how the file was written; `ingest` will
        in general silently ignore incorrect formatters (as it cannot
        efficiently verify their correctness), deferring errors until ``get``
        is first called on the ingested dataset.

        Parameters
        ----------
        path : `str`
            File path.  Treated as relative to the repository root if not
            absolute.
        ref : `DatasetRef`
            Reference to the associated Dataset.
        formatter : `Formatter` (optional)
            Formatter that should be used to retrieve the Dataset.  If not
            provided, the formatter will be constructed according to
            Datastore configuration.
        transfer : str (optional)
            If not None, must be one of 'move' or 'copy' indicating how to
            transfer the file.  The new filename and location will be
            determined via template substitution, as with ``put``.  If the file
            is outside the datastore root, it must be transferred somehow.

        Raises
        ------
        RuntimeError
            Raised if ``transfer is None`` and path is outside the repository
            root.
        FileNotFoundError
            Raised if the file at ``path`` does not exist.
        FileExistsError
            Raised if ``transfer is not None`` but a file already exists at the
            location computed from the template.
        PermissionError
            Raised when the check whether a file exists at the target
            location in S3 cannot be made because the IAM user lacks the
            s3:GetObject or s3:ListBucket permissions.
        """
        if not self.constraints.isAcceptable(ref):
            # Raise rather than use boolean return value.
            raise DatasetTypeNotSupportedError(
                f"Dataset {ref} has been rejected by this datastore via"
                " configuration.")

        if formatter is None:
            formatter = self.formatterFactory.getFormatterClass(ref)

        # Ingest can occur from file->s3 and s3->s3 (the source can be file
        # or s3, the target will always be s3). The file has to exist at the
        # source location. Schemeless URIs are assumed to obey os.path rules.
        # Equivalent to the os.path.exists(fullPath) check in PosixDatastore.
        srcUri = ButlerURI(path)
        if srcUri.scheme == 'file' or not srcUri.scheme:
            if not os.path.exists(srcUri.ospath):
                raise FileNotFoundError(
                    f"File at '{srcUri}' does not exist; note that paths to ingest are "
                    "assumed to be relative to self.root unless they are absolute."
                )
        elif srcUri.scheme == 's3':
            if not s3CheckFileExists(srcUri, client=self.client)[0]:
                raise FileNotFoundError(
                    f"File at '{srcUri}' does not exist; note that paths to "
                    "ingest are assumed to be relative to self.root unless "
                    "they are absolute.")
        else:
            raise NotImplementedError(
                f"Scheme type {srcUri.scheme} not supported.")

        # Transfer is generally None when put calls ingest. In that case the
        # file was either uploaded by put or is already in the proper
        # location, so the source location must be inside the repository. In
        # other cases the created target location must be inside the root,
        # and the source file must be deleted when 'move'd.
        rootUri = ButlerURI(self.root)
        if transfer is None:
            if srcUri.scheme == "file":
                raise RuntimeError(
                    f"'{srcUri}' is not inside repository root '{rootUri}'. "
                    "Ingesting local data to S3Datastore without upload "
                    "to S3 is not allowed.")
            elif srcUri.scheme == "s3":
                if not srcUri.path.startswith(rootUri.path):
                    raise RuntimeError(
                        f"'{srcUri}' is not inside repository root '{rootUri}'."
                    )
            p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
            pathInStore = str(p.relative_to(rootUri.relativeToPathRoot))
            tgtLocation = self.locationFactory.fromPath(pathInStore)
        elif transfer == "move" or transfer == "copy":
            if srcUri.scheme == "file":
                # source is on local disk.
                template = self.templates.getTemplate(ref)
                location = self.locationFactory.fromPath(template.format(ref))
                tgtPathInStore = formatter.predictPathFromLocation(location)
                tgtLocation = self.locationFactory.fromPath(tgtPathInStore)
                self.client.upload_file(Bucket=tgtLocation.netloc,
                                        Key=tgtLocation.relativeToPathRoot,
                                        Filename=srcUri.ospath)
                if transfer == "move":
                    os.remove(srcUri.ospath)
            elif srcUri.scheme == "s3":
                # source is another S3 Bucket
                relpath = srcUri.relativeToPathRoot
                copySrc = {"Bucket": srcUri.netloc, "Key": relpath}
                self.client.copy(copySrc, self.locationFactory.netloc, relpath)
                if transfer == "move":
                    # https://github.com/boto/boto3/issues/507 - there is no
                    # way of knowing if the file was actually deleted except
                    # for checking all the keys again; the response is always
                    # HTTP 204, whether or not the key existed
                    self.client.delete_object(Bucket=srcUri.netloc,
                                              Key=relpath)
                p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
                relativeToDatastoreRoot = str(
                    p.relative_to(rootUri.relativeToPathRoot))
                tgtLocation = self.locationFactory.fromPath(
                    relativeToDatastoreRoot)
        else:
            raise NotImplementedError(
                f"Transfer type '{transfer}' not supported.")

        # the file should exist on the bucket by now
        exists, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                         bucket=tgtLocation.netloc,
                                         client=self.client)

        # Update the registry
        self._register_dataset_file(ref, formatter, tgtLocation.pathInStore,
                                    size, None)
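
Finally, a hedged sketch of where the PermissionError documented in ingest can come from: without s3:GetObject or s3:ListBucket, a HEAD request returns 403 rather than 404, which an existence check may translate explicitly. The helper name is hypothetical.

import boto3
from botocore.exceptions import ClientError


def headOrRaise(client, bucket: str, key: str) -> bool:
    """HEAD an object, mapping 404 to False and 403 to PermissionError."""
    try:
        client.head_object(Bucket=bucket, Key=key)
    except ClientError as err:
        code = err.response["ResponseMetadata"]["HTTPStatusCode"]
        if code == 404:
            return False
        if code == 403:
            raise PermissionError(
                f"Forbidden HEAD on s3://{bucket}/{key}; check that the IAM "
                "user has s3:GetObject and s3:ListBucket permissions."
            ) from err
        raise
    return True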