def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
                               ref: DatasetRef, isComponent: bool = False) -> Any:
    """Download a stored artifact and deserialize it into a Python object.

    Parameters
    ----------
    getInfo : `DatastoreFileGetInformation`
        Bundle describing the artifact: location, formatter, stored file
        record, read storage class and assembler parameters.
    ref : `DatasetRef`
        Reference to the dataset being retrieved; used only for error
        reporting.
    isComponent : `bool`, optional
        If `True`, read the named component rather than the composite.

    Returns
    -------
    result : `object`
        The in-memory dataset after post-processing.

    Raises
    ------
    RuntimeError
        Raised if the downloaded byte count disagrees with the size
        recorded in the datastore registry.
    ValueError
        Raised if the formatter fails to deserialize the bytes.
    """
    dataLocation = getInfo.location
    log.debug("Downloading data from %s", dataLocation.uri)
    payload = dataLocation.uri.read()

    # Guard against truncated or corrupted downloads by comparing the
    # byte count against the size recorded when the file was ingested.
    fileRecord = getInfo.info
    if len(payload) != fileRecord.file_size:
        raise RuntimeError(
            "Integrity failure in Datastore. "
            f"Size of file {dataLocation.path} ({len(payload)}) "
            f"does not match recorded size of {fileRecord.file_size}")

    # Deserialize directly from bytes when the formatter supports it;
    # otherwise spill to a temporary file and use the file-based read
    # path (the equivalent of the PosixDatastore formatter.read block).
    component = getInfo.component if isComponent else None
    fmt = getInfo.formatter
    try:
        result = fmt.fromBytes(payload, component=component)
    except NotImplementedError:
        # formatter might not always have an extension so mypy complains
        # We can either ignore the complaint or use a temporary location
        scratchLoc = fmt.makeUpdatedLocation(Location(".", "temp"))
        with tempfile.NamedTemporaryFile(suffix=scratchLoc.getExtension()) as scratch:
            scratch.write(payload)
            # Flush the write. Do not close the file because that
            # will delete it.
            scratch.flush()
            fmt._fileDescriptor.location = Location(*os.path.split(scratch.name))
            result = fmt.read(component=component)
    except Exception as e:
        raise ValueError(
            f"Failure from formatter '{fmt.name()}' for dataset {ref.id}"
            f" ({ref.datasetType.name} from {dataLocation.uri}): {e}") from e

    return self._post_process_get(result, getInfo.readStorageClass,
                                  getInfo.assemblerParams, isComponent=isComponent)
def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
                               ref: DatasetRef, isComponent: bool = False) -> Any:
    """Download an artifact from S3 and deserialize it into a Python object.

    Parameters
    ----------
    getInfo : `DatastoreFileGetInformation`
        Bundle describing the artifact: location, formatter, stored file
        record, read storage class and assembler parameters.
    ref : `DatasetRef`
        Reference to the dataset being retrieved; used only for error
        reporting.
    isComponent : `bool`, optional
        If `True`, read the named component rather than the composite.

    Returns
    -------
    result : `object`
        The in-memory dataset after post-processing.

    Raises
    ------
    FileNotFoundError
        Raised when the object is missing (404) or inaccessible (403).
    RuntimeError
        Raised if the response's ContentLength disagrees with the size
        recorded in the datastore registry.
    ValueError
        Raised if the formatter fails to deserialize the bytes.
    """
    location = getInfo.location

    # since we have to make a GET request to S3 anyhow (for download) we
    # might as well use the HEADER metadata for size comparison instead.
    # s3CheckFileExists would just duplicate GET/LIST charges in this case.
    try:
        log.debug("Reading file: %s", location.uri)
        response = self.client.get_object(Bucket=location.netloc,
                                          Key=location.relativeToPathRoot)
        log.debug("Successfully read file: %s", location.uri)
    except self.client.exceptions.ClientError as err:
        errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
        # get_object returns 404 when the object does not exist only when
        # the user has s3:ListBucket permission. If list permission does
        # not exist a 403 is returned. In practical terms this usually
        # means that the file does not exist, but it could also mean the
        # user lacks GetObject permission. It's hard to tell which case
        # it is, so raise FileNotFoundError with a message covering both.
        # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectGET.html
        if errorcode == 403:
            raise FileNotFoundError(
                f"Dataset with Id {ref.id} not accessible at "
                f"expected location {location}. Forbidden GET "
                "operation error occurred. Verify s3:ListBucket "
                "and s3:GetObject permissions are granted for "
                "your IAM user and that file exists. ") from err
        if errorcode == 404:
            errmsg = f"Dataset with Id {ref.id} does not exist at expected location {location}."
            raise FileNotFoundError(errmsg) from err
        # other errors are re-raised also, but less descriptively;
        # bare raise preserves the original traceback.
        raise

    # Guard against truncated or corrupted objects by comparing the
    # advertised ContentLength against the size recorded at ingest.
    storedFileInfo = getInfo.info
    if response["ContentLength"] != storedFileInfo.file_size:
        raise RuntimeError(
            "Integrity failure in Datastore. "
            f"Size of file {location.path} ({response['ContentLength']}) "
            f"does not match recorded size of {storedFileInfo.file_size}")

    # download the data as bytes
    serializedDataset = response["Body"].read()

    # format the downloaded bytes into appropriate object directly, or via
    # tempfile (when formatter does not support to/from/Bytes). This is S3
    # equivalent of PosixDatastore formatter.read try-except block.
    formatter = getInfo.formatter
    try:
        result = formatter.fromBytes(
            serializedDataset, component=getInfo.component if isComponent else None)
    except NotImplementedError:
        # formatter might not always have an extension so mypy complains
        # We can either ignore the complaint or use a temporary location
        tmpLoc = Location(".", "temp")
        tmpLoc = formatter.makeUpdatedLocation(tmpLoc)
        with tempfile.NamedTemporaryFile(suffix=tmpLoc.getExtension()) as tmpFile:
            tmpFile.write(serializedDataset)
            # Flush the write. Do not close the file because that
            # will delete it.
            tmpFile.flush()
            formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
            result = formatter.read(
                component=getInfo.component if isComponent else None)
    except Exception as e:
        raise ValueError(
            f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
            f" ({ref.datasetType.name} from {location.uri}): {e}") from e

    return self._post_process_get(result, getInfo.readStorageClass,
                                  getInfo.assemblerParams, isComponent=isComponent)