Example #1
def uploadFromPath(localFilePath, partSize, bucket, fileID, headers):
    """
    Uploads a file to s3, using multipart uploading if applicable

    :param str localFilePath: Path of the file to upload to s3
    :param int partSize: max size of each part in the multipart upload, in bytes
    :param boto.s3.Bucket bucket: the s3 bucket to upload to
    :param str fileID: the name of the file to upload to
    :param headers: http headers to use when uploading - generally used for encryption purposes
    :return: version of the newly uploaded file
    """
    file_size, file_time = fileSizeAndTime(localFilePath)
    if file_size <= partSize:
        key = bucket.new_key(key_name=compat_bytes(fileID))
        key.name = fileID
        for attempt in retry_s3():
            with attempt:
                key.set_contents_from_filename(localFilePath, headers=headers)
        version = key.version_id
    else:
        with open(localFilePath, 'rb') as f:
            version = chunkedFileUpload(f, bucket, fileID, file_size, headers, partSize)
    for attempt in retry_s3():
        with attempt:
            key = bucket.get_key(compat_bytes(fileID),
                                 headers=headers,
                                 version_id=version)
    assert key.size == file_size
    # Make reasonably sure that the file wasn't touched during the upload
    assert fileSizeAndTime(localFilePath) == (file_size, file_time)
    return version
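
A minimal usage sketch for this boto 2 variant, assuming the module-level helpers it relies on (fileSizeAndTime, retry_s3, chunkedFileUpload, compat_bytes) are in scope; the bucket name, key name, and headers below are hypothetical placeholders.

# Hypothetical usage sketch (boto 2); names are placeholders and the bucket must already exist.
import boto

conn = boto.connect_s3()
bucket = conn.get_bucket('my-toil-bucket')
version = uploadFromPath('/tmp/reads.fastq',
                         partSize=50 << 20,           # 50 MiB parts
                         bucket=bucket,
                         fileID='files/reads.fastq',
                         headers=None)                # or SSE-C headers for encryption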
Example #2
def copyKeyMultipart(srcBucketName, srcKeyName, srcKeyVersion, dstBucketName, dstKeyName, sseAlgorithm=None, sseKey=None,
                     copySourceSseAlgorithm=None, copySourceSseKey=None):
    """
    Copies a key from a source key to a destination key in multiple parts. Note that if the
    destination key exists it will be overwritten implicitly, and if it does not exist a new
    key will be created. If the destination bucket does not exist an error will be raised.

    :param str srcBucketName: The name of the bucket to be copied from.
    :param str srcKeyName: The name of the key to be copied from.
    :param str srcKeyVersion: The version of the key to be copied from.
    :param str dstBucketName: The name of the destination bucket for the copy.
    :param str dstKeyName: The name of the destination key that will be created or overwritten.
    :param str sseAlgorithm: Server-side encryption algorithm for the destination.
    :param str sseKey: Server-side encryption key for the destination.
    :param str copySourceSseAlgorithm: Server-side encryption algorithm for the source.
    :param str copySourceSseKey: Server-side encryption key for the source.

    :rtype: str
    :return: The version of the copied file (or None if versioning is not enabled for dstBucket).
    """
    s3 = boto3.resource('s3')
    dstBucket = s3.Bucket(compat_bytes(dstBucketName))
    dstObject = dstBucket.Object(compat_bytes(dstKeyName))
    copySource = {'Bucket': compat_bytes(srcBucketName), 'Key': compat_bytes(srcKeyName)}
    if srcKeyVersion is not None:
        copySource['VersionId'] = compat_bytes(srcKeyVersion)

    # The boto3 functions don't allow passing parameters as None to
    # indicate they weren't provided. So we have to do a bit of work
    # to ensure we only provide the parameters when they are actually
    # required.
    destEncryptionArgs = {}
    if sseKey is not None:
        destEncryptionArgs.update({'SSECustomerAlgorithm': sseAlgorithm,
                                   'SSECustomerKey': sseKey})
    copyEncryptionArgs = {}
    if copySourceSseKey is not None:
        copyEncryptionArgs.update({'CopySourceSSECustomerAlgorithm': copySourceSseAlgorithm,
                                   'CopySourceSSECustomerKey': copySourceSseKey})
    copyEncryptionArgs.update(destEncryptionArgs)

    dstObject.copy(copySource, ExtraArgs=copyEncryptionArgs)

    # Wait until the object exists before calling head_object
    object_summary = s3.ObjectSummary(dstObject.bucket_name, dstObject.key)
    object_summary.wait_until_exists(**destEncryptionArgs)

    # Unfortunately, boto3's managed copy doesn't return the version
    # that it actually copied to. So we have to check immediately
    # after, leaving open the possibility that it may have been
    # modified again in the few seconds since the copy finished. There
    # isn't much we can do about it.
    info = boto3.client('s3').head_object(Bucket=dstObject.bucket_name, Key=dstObject.key,
                                          **destEncryptionArgs)
    return info.get('VersionId', None)
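
A minimal usage sketch, assuming both objects are protected with the same SSE-C customer key; the bucket names, key names, and key value are hypothetical placeholders.

# Hypothetical usage sketch; names and the SSE-C key are placeholders.
sseKey = 'x' * 32  # SSE-C expects a 256-bit customer key
newVersion = copyKeyMultipart(srcBucketName='src-bucket',
                              srcKeyName='data/input.bam',
                              srcKeyVersion=None,               # copy the latest version
                              dstBucketName='dst-bucket',
                              dstKeyName='data/input.bam',
                              sseAlgorithm='AES256',
                              sseKey=sseKey,
                              copySourceSseAlgorithm='AES256',
                              copySourceSseKey=sseKey)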
Example #3
def uploadFromPath(localFilePath: str,
                   resource,
                   bucketName: str,
                   fileID: str,
                   headerArgs: Optional[dict] = None,
                   partSize: int = 50 << 20):
    """
    Uploads a file to s3, using multipart uploading if applicable

    :param str localFilePath: Path of the file to upload to s3
    :param S3.Resource resource: boto3 resource
    :param str bucketName: name of the bucket to upload to
    :param str fileID: the name of the file to upload to
    :param dict headerArgs: http headers to use when uploading - generally used for encryption purposes
    :param int partSize: max size of each part in the multipart upload, in bytes

    :return: version of the newly uploaded file
    """
    if headerArgs is None:
        headerArgs = {}

    client = resource.meta.client
    file_size, file_time = fileSizeAndTime(localFilePath)

    version = uploadFile(localFilePath, resource, bucketName, fileID, headerArgs, partSize)
    info = client.head_object(Bucket=bucketName, Key=compat_bytes(fileID), VersionId=version, **headerArgs)
    size = info.get('ContentLength')

    assert size == file_size

    # Make reasonably sure that the file wasn't touched during the upload
    assert fileSizeAndTime(localFilePath) == (file_size, file_time)
    return version
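
A minimal usage sketch for the boto3 variant, assuming an existing bucket and SSE-C header names as accepted by boto3's upload and head_object calls; all names and the key value are placeholders.

# Hypothetical usage sketch (boto3); names and the SSE-C key are placeholders.
import boto3

resource = boto3.resource('s3')
headerArgs = {'SSECustomerAlgorithm': 'AES256', 'SSECustomerKey': 'x' * 32}
version = uploadFromPath('/tmp/reads.fastq', resource, 'my-toil-bucket',
                         'files/reads.fastq', headerArgs=headerArgs)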
Example #4
def chunkedFileUpload(readable, bucket, fileID, file_size, headers=None, partSize=50 << 20):
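    # Streams `readable` to S3 in partSize chunks with boto 2's multipart-upload API.
    # Every request is retried via retry_s3(); on failure the upload is cancelled inside
    # panic(), which logs and re-raises the original exception.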
    for attempt in retry_s3():
        with attempt:
            upload = bucket.initiate_multipart_upload(
                key_name=compat_bytes(fileID),
                headers=headers)
    try:
        start = 0
        part_num = itertools.count()
        while start < file_size:
            end = min(start + partSize, file_size)
            assert readable.tell() == start
            for attempt in retry_s3():
                with attempt:
                    upload.upload_part_from_file(fp=readable,
                                                 part_num=next(part_num) + 1,
                                                 size=end - start,
                                                 headers=headers)
            start = end
        assert readable.tell() == file_size == start
    except:
        with panic(log=log):
            for attempt in retry_s3():
                with attempt:
                    upload.cancel_upload()
    else:
        for attempt in retry_s3():
            with attempt:
                version = upload.complete_upload().version_id
    return version
Example #5
    def _download_stream(self, fileName, encrypt=True, encoding=None, errors=None):
        """
        Yields a context manager that can be used to read from the bucket
        with a stream. See :class:`~toil.jobStores.utils.ReadablePipe` for an example.

        :param fileName: name of file in bucket to be read
        :type fileName: str

        :param encrypt: whether or not the file is encrypted
        :type encrypt: bool

        :param str encoding: the name of the encoding used to encode the file. Encodings are the same
                as for encode(). Defaults to None which represents binary mode.

        :param str errors: an optional string that specifies how encoding errors are to be handled. Errors
                are the same as for open(). Defaults to 'strict' when an encoding is specified.

        :return: an instance of ReadablePipe.
        :rtype: :class:`~toil.jobStores.utils.ReadablePipe`
        """

        blob = self.bucket.get_blob(compat_bytes(fileName), encryption_key=self.sseKey if encrypt else None)
        if blob is None:
            raise NoSuchFileException(fileName)

        class DownloadPipe(ReadablePipe):
            def writeTo(self, writable):
                try:
                    blob.download_to_file(writable)
                finally:
                    writable.close()

        with DownloadPipe(encoding=encoding, errors=errors) as readable:
            yield readable
Example #6
    def _upload_stream(self, fileName, update=False, encrypt=True, encoding=None, errors=None):
        """
        Yields a context manager that can be used to write to the bucket
        with a stream. See :class:`~toil.jobStores.utils.WritablePipe` for an example.

        Will throw assertion error if the file shouldn't be updated
        and yet exists.

        :param fileName: name of file to be inserted into bucket
        :type fileName: str

        :param update: whether or not the file is to be updated
        :type update: bool

        :param encrypt: whether or not the file is encrypted
        :type encrypt: bool

        :param str encoding: the name of the encoding used to encode the file. Encodings are the same
                as for encode(). Defaults to None which represents binary mode.

        :param str errors: an optional string that specifies how encoding errors are to be handled. Errors
                are the same as for open(). Defaults to 'strict' when an encoding is specified.

        :return: an instance of WritablePipe.
        :rtype: :class:`~toil.jobStores.utils.WritablePipe`
        """
        blob = self.bucket.blob(compat_bytes(fileName), encryption_key=self.sseKey if encrypt else None)
        class UploadPipe(WritablePipe):
            def readFrom(self, readable):
                if not update:
                    assert not blob.exists()
                blob.upload_from_file(readable)

        with UploadPipe(encoding=encoding, errors=errors) as writable:
            yield writable
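
A minimal usage sketch for the two stream helpers above, assuming they are exposed as context managers (the generators are presumably wrapped with contextlib.contextmanager in the enclosing job store class); the instance name and file name are placeholders.

# Hypothetical usage sketch; assumes jobStore is an instance of the enclosing class.
with jobStore._upload_stream('stats/report.txt', encoding='utf-8') as writable:
    writable.write('42 jobs completed\n')

with jobStore._download_stream('stats/report.txt', encoding='utf-8') as readable:
    print(readable.read())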
Example #7
    def _uploadStream(self, fileName, update=False, encrypt=True):
        """
        Yields a context manager that can be used to write to the bucket
        with a stream. See :class:`~toil.jobStores.utils.WritablePipe` for an example.

        Will throw assertion error if the file shouldn't be updated
        and yet exists.

        :param fileName: name of file to be inserted into bucket
        :type fileName: str
        :param update: whether or not the file is to be updated
        :type update: bool
        :param encrypt: whether or not the file is encrypted
        :type encrypt: bool
        :return: an instance of WritablePipe.
        :rtype: :class:`~toil.jobStores.utils.WritablePipe`
        """
        blob = self.bucket.blob(compat_bytes(fileName), encryption_key=self.sseKey if encrypt else None)

        class UploadPipe(WritablePipe):
            def readFrom(self, readable):
                if not update:
                    assert not blob.exists()
                blob.upload_from_file(readable)

        with UploadPipe() as writable:
            yield writable
Example #8
    def _getBlobFromURL(cls, url, exists=False):
        """
        Gets the blob specified by the url.

        caution: makes no api request. blob may not ACTUALLY exist

        :param urlparse.ParseResult url: the URL

        :param bool exists: if True, then syncs local blob object with cloud
        and raises exceptions if it doesn't exist remotely

        :return: the blob requested
        :rtype: :class:`~google.cloud.storage.blob.Blob`
        """
        bucketName = url.netloc
        fileName = url.path

        # remove leading '/', which can cause problems if fileName is a path
        if fileName.startswith('/'):
            fileName = fileName[1:]

        storageClient = storage.Client()
        bucket = storageClient.get_bucket(bucketName)
        blob = bucket.blob(compat_bytes(fileName))

        if exists:
            if not blob.exists():
                raise NoSuchFileException
            # sync with cloud so info like size is available
            blob.reload()
        return blob
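
A minimal usage sketch, assuming the enclosing class is the GoogleJobStore seen in the later examples and that the gs:// URL is pre-parsed with urlparse; the bucket and path are placeholders, and Google Cloud credentials must be available in the environment.

# Hypothetical usage sketch; the gs:// URL is a placeholder.
from urllib.parse import urlparse

url = urlparse('gs://my-bucket/inputs/genome.fa')
blob = GoogleJobStore._getBlobFromURL(url, exists=True)  # raises NoSuchFileException if absent
print(blob.size)  # populated because exists=True triggers blob.reload()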
Example #9
    def _downloadStream(self, fileName, encrypt=True):
        """
        Yields a context manager that can be used to read from the bucket
        with a stream. See :class:`~toil.jobStores.utils.ReadablePipe` for an example.

        :param fileName: name of file in bucket to be read
        :type fileName: str
        :param encrypt: whether or not the file is encrypted
        :type encrypt: bool
        :return: an instance of ReadablePipe.
        :rtype: :class:`~toil.jobStores.utils.ReadablePipe`
        """
        blob = self.bucket.get_blob(compat_bytes(fileName), encryption_key=self.sseKey if encrypt else None)
        if blob is None:
            raise NoSuchFileException(fileName)

        class DownloadPipe(ReadablePipe):
            def writeTo(self, writable):
                try:
                    blob.download_to_file(writable)
                finally:
                    writable.close()

        with DownloadPipe() as readable:
            yield readable
Example #10
    def readFile(self, jobStoreFileID, localFilePath, symlink=False):
        # used on non-shared files which will be encrypted if available
        # checking for JobStoreID existence
        if not self.fileExists(jobStoreFileID):
            raise NoSuchFileException(jobStoreFileID)
        with open(localFilePath, 'wb') as writeable:
            blob = self.bucket.get_blob(compat_bytes(jobStoreFileID), encryption_key=self.sseKey)
            blob.download_to_file(writeable)
Example #11
def uploadFile(readable,
               resource,
               bucketName: str,
               fileID: str,
               headerArgs: Optional[dict] = None,
               partSize: int = 50 << 20):
    """
    Upload a readable object to s3, using multipart uploading if applicable.
    :param readable: a readable stream or a file path to upload to s3
    :param S3.Resource resource: boto3 resource
    :param str bucketName: name of the bucket to upload to
    :param str fileID: the name of the file to upload to
    :param dict headerArgs: http headers to use when uploading - generally used for encryption purposes
    :param int partSize: max size of each part in the multipart upload, in bytes
    :return: version of the newly uploaded file
    """
    if headerArgs is None:
        headerArgs = {}

    client = resource.meta.client
    config = TransferConfig(multipart_threshold=partSize,
                            multipart_chunksize=partSize,
                            use_threads=True)
    if isinstance(readable, str):
        client.upload_file(Filename=readable,
                           Bucket=bucketName,
                           Key=compat_bytes(fileID),
                           ExtraArgs=headerArgs,
                           Config=config)
    else:
        client.upload_fileobj(Fileobj=readable,
                              Bucket=bucketName,
                              Key=compat_bytes(fileID),
                              ExtraArgs=headerArgs,
                              Config=config)

        # Wait until the object exists before calling head_object
        object_summary = resource.ObjectSummary(bucketName,
                                                compat_bytes(fileID))
        object_summary.wait_until_exists(**headerArgs)

    info = client.head_object(Bucket=bucketName,
                              Key=compat_bytes(fileID),
                              **headerArgs)
    return info.get('VersionId', None)
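
A minimal usage sketch showing both accepted input types; the bucket and key names are placeholders and the bucket is assumed to exist.

# Hypothetical usage sketch; names are placeholders.
import boto3

resource = boto3.resource('s3')
# From a path (multipart upload is used once the file exceeds partSize):
version = uploadFile('/tmp/reads.fastq', resource, 'my-toil-bucket', 'files/reads.fastq')
# From an already-open binary stream:
with open('/tmp/reads.fastq', 'rb') as f:
    version = uploadFile(f, resource, 'my-toil-bucket', 'files/reads-copy.fastq')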
Example #12
    def _writeFile(self, jobStoreID, fileObj, update=False, encrypt=True):
        blob = self.bucket.blob(compat_bytes(jobStoreID), encryption_key=self.sseKey if encrypt else None)
        if not update:
            # TODO: should probably raise a special exception and be added to all jobStores
            assert not blob.exists()
        else:
            if not blob.exists():
                raise NoSuchFileException(jobStoreID)
        blob.upload_from_file(fileObj)
Example #13
    def readStatsAndLogging(self, callback, readAll=False):
        prefix = self.readStatsBaseID if readAll else self.statsBaseID
        filesRead = 0
        lastTry = False

        while True:
            filesReadThisLoop = 0
            # prefix seems broken
            for blob in self.bucket.list_blobs(prefix=compat_bytes(prefix)):
                try:
                    with self.readSharedFileStream(blob.name) as readable:
                        log.debug("Reading stats file: %s", blob.name)
                        callback(readable)
                        filesReadThisLoop += 1
                    if not readAll:
                        # rename this file by copying it and deleting the old version to avoid
                        # rereading it
                        newID = self.readStatsBaseID + blob.name[
                            len(self.statsBaseID):]
                        # NOTE: just copies then deletes old.
                        self.bucket.rename_blob(blob, compat_bytes(newID))
                except NoSuchFileException:
                    log.debug("Stats file not found: %s", blob.name)
            if readAll:
                # The readAll parameter is only used by the toil stats util after the completion of the
                # pipeline. Assume that this means the bucket is in a consistent state when readAll
                # is passed.
                return filesReadThisLoop
            if filesReadThisLoop == 0:
                # Listing is unfortunately eventually consistent so we can't be 100% sure there
                # really aren't any stats files left to read
                if lastTry:
                    # this was our second try, we are reasonably sure there aren't any stats
                    # left to gather
                    break
                # Try one more time in a couple seconds
                time.sleep(5)
                lastTry = True
                continue
            else:
                lastTry = False
                filesRead += filesReadThisLoop

        return filesRead
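
A minimal usage sketch of the callback protocol, assuming jobStore is an instance of the enclosing job store class; the callback simply echoes each stats file it is handed.

# Hypothetical usage sketch; the callback receives one readable stream per stats file.
def printStats(readable):
    print(readable.read())

newlyRead = jobStore.readStatsAndLogging(printStats)            # reads and moves aside new stats files
total = jobStore.readStatsAndLogging(printStats, readAll=True)  # re-reads everything after the pipeline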
Example #14
    def read_file(self, file_id, local_path, symlink=False):
        # used on non-shared files which will be encrypted if available
        # checking for JobStoreID existence
        if not self.file_exists(file_id):
            raise NoSuchFileException(file_id)
        with AtomicFileCreate(local_path) as tmpPath:
            with open(tmpPath, 'wb') as writeable:
                blob = self.bucket.get_blob(compat_bytes(file_id), encryption_key=self.sseKey)
                blob.download_to_file(writeable)
        if getattr(file_id, 'executable', False):
            os.chmod(local_path, os.stat(local_path).st_mode | stat.S_IXUSR)
Example #15
    def _readContents(self, jobStoreID):
        """
        To be used on files representing jobs only; these will be encrypted if possible.
        :param jobStoreID: the ID of the job
        :type jobStoreID: str
        :return: contents of the job file
        :rtype: string
        """
        job = self.bucket.get_blob(compat_bytes(jobStoreID), encryption_key=self.sseKey)
        if job is None:
            raise NoSuchJobException(jobStoreID)
        return job.download_as_string()
Example #16
    def initialize(self, config=None):
        try:
            self.bucket = self.storageClient.create_bucket(self.bucketName)
        except exceptions.Conflict:
            raise JobStoreExistsException(self.locator)
        super(GoogleJobStore, self).initialize(config)

        # set up server-side encryption after we set up config in super
        if self.config.sseKey is not None:
            with open(self.config.sseKey) as f:
                self.sseKey = compat_bytes(f.read())
                assert len(self.sseKey) == 32
Example #17
    def job_exists(self, job_id):
        return self.bucket.blob(compat_bytes(job_id), encryption_key=self.sseKey).exists()
Example #18
    def file_exists(self, file_id):
        return self.bucket.blob(compat_bytes(file_id), encryption_key=self.sseKey).exists()
Example #19
    def get_file_size(self, file_id):
        if not self.file_exists(file_id):
            return 0
        return self.bucket.get_blob(compat_bytes(file_id), encryption_key=self.sseKey).size
Example #20
    def _delete(self, jobStoreFileID):
        if self.fileExists(jobStoreFileID):
            self.bucket.get_blob(compat_bytes(jobStoreFileID)).delete()
Example #21
    def getFileSize(self, jobStoreFileID):
        if not self.fileExists(jobStoreFileID):
            return 0
        return self.bucket.get_blob(compat_bytes(jobStoreFileID), encryption_key=self.sseKey).size
Example #22
    def fileExists(self, jobStoreFileID):
        return self.bucket.blob(compat_bytes(jobStoreFileID), encryption_key=self.sseKey).exists()
Example #23
    def delete(self, jobStoreID):
        self._delete(jobStoreID)

        # best effort delete associated files
        for blob in self.bucket.list_blobs(prefix=compat_bytes(jobStoreID)):
            self._delete(blob.name)
Example #24
    def getPublicUrl(self, fileName):
        blob = self.bucket.get_blob(compat_bytes(fileName), encryption_key=self.sseKey)
        if blob is None:
            raise NoSuchFileException(fileName)
        return blob.generate_signed_url(self.publicUrlExpiration)