def uploadFromPath(localFilePath, partSize, bucket, fileID, headers):
    """
    Uploads a file to s3, using multipart uploading if applicable

    :param str localFilePath: Path of the file to upload to s3
    :param int partSize: max size of each part in the multipart upload, in bytes
    :param boto.s3.Bucket bucket: the s3 bucket to upload to
    :param str fileID: the name of the file to upload to
    :param headers: http headers to use when uploading - generally used for encryption purposes
    :return: version of the newly uploaded file
    """
    file_size, file_time = fileSizeAndTime(localFilePath)
    if file_size <= partSize:
        key = bucket.new_key(key_name=compat_bytes(fileID))
        key.name = fileID
        for attempt in retry_s3():
            with attempt:
                key.set_contents_from_filename(localFilePath, headers=headers)
        version = key.version_id
    else:
        with open(localFilePath, 'rb') as f:
            version = chunkedFileUpload(f, bucket, fileID, file_size, headers, partSize)
    for attempt in retry_s3():
        with attempt:
            key = bucket.get_key(compat_bytes(fileID), headers=headers, version_id=version)
    assert key.size == file_size
    # Make reasonably sure that the file wasn't touched during the upload
    assert fileSizeAndTime(localFilePath) == (file_size, file_time)
    return version
def copyKeyMultipart(srcBucketName, srcKeyName, srcKeyVersion, dstBucketName, dstKeyName,
                     sseAlgorithm=None, sseKey=None, copySourceSseAlgorithm=None, copySourceSseKey=None):
    """
    Copies a key from a source key to a destination key in multiple parts. Note that if the
    destination key exists it will be overwritten implicitly, and if it does not exist a new
    key will be created. If the destination bucket does not exist an error will be raised.

    :param str srcBucketName: The name of the bucket to be copied from.
    :param str srcKeyName: The name of the key to be copied from.
    :param str srcKeyVersion: The version of the key to be copied from.
    :param str dstBucketName: The name of the destination bucket for the copy.
    :param str dstKeyName: The name of the destination key that will be created or overwritten.
    :param str sseAlgorithm: Server-side encryption algorithm for the destination.
    :param str sseKey: Server-side encryption key for the destination.
    :param str copySourceSseAlgorithm: Server-side encryption algorithm for the source.
    :param str copySourceSseKey: Server-side encryption key for the source.

    :rtype: str
    :return: The version of the copied file (or None if versioning is not enabled for dstBucket).
    """
    s3 = boto3.resource('s3')
    dstBucket = s3.Bucket(compat_bytes(dstBucketName))
    dstObject = dstBucket.Object(compat_bytes(dstKeyName))
    copySource = {'Bucket': compat_bytes(srcBucketName), 'Key': compat_bytes(srcKeyName)}
    if srcKeyVersion is not None:
        copySource['VersionId'] = compat_bytes(srcKeyVersion)

    # The boto3 functions don't allow passing parameters as None to
    # indicate they weren't provided. So we have to do a bit of work
    # to ensure we only provide the parameters when they are actually
    # required.
    destEncryptionArgs = {}
    if sseKey is not None:
        destEncryptionArgs.update({'SSECustomerAlgorithm': sseAlgorithm,
                                   'SSECustomerKey': sseKey})
    copyEncryptionArgs = {}
    if copySourceSseKey is not None:
        copyEncryptionArgs.update({'CopySourceSSECustomerAlgorithm': copySourceSseAlgorithm,
                                   'CopySourceSSECustomerKey': copySourceSseKey})
    copyEncryptionArgs.update(destEncryptionArgs)

    dstObject.copy(copySource, ExtraArgs=copyEncryptionArgs)

    # Wait until the object exists before calling head_object
    object_summary = s3.ObjectSummary(dstObject.bucket_name, dstObject.key)
    object_summary.wait_until_exists(**destEncryptionArgs)

    # Unfortunately, boto3's managed copy doesn't return the version
    # that it actually copied to. So we have to check immediately
    # after, leaving open the possibility that it may have been
    # modified again in the few seconds since the copy finished. There
    # isn't much we can do about it.
    info = boto3.client('s3').head_object(Bucket=dstObject.bucket_name,
                                          Key=dstObject.key,
                                          **destEncryptionArgs)
    return info.get('VersionId', None)
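# A minimal usage sketch for copyKeyMultipart. The bucket and key names below are
# hypothetical placeholders; pass the optional SSE-C arguments only when the source
# and/or destination objects are encrypted with customer-provided keys.
copiedVersion = copyKeyMultipart(srcBucketName='my-src-bucket',
                                 srcKeyName='inputs/sample.bam',
                                 srcKeyVersion=None,
                                 dstBucketName='my-dst-bucket',
                                 dstKeyName='inputs/sample.bam')
# copiedVersion is None if versioning is not enabled on the destination bucket.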
def uploadFromPath(localFilePath: str,
                   resource,
                   bucketName: str,
                   fileID: str,
                   headerArgs: Optional[dict] = None,
                   partSize: int = 50 << 20):
    """
    Uploads a file to s3, using multipart uploading if applicable

    :param str localFilePath: Path of the file to upload to s3
    :param S3.Resource resource: boto3 resource
    :param str bucketName: name of the bucket to upload to
    :param str fileID: the name of the file to upload to
    :param dict headerArgs: http headers to use when uploading - generally used for encryption purposes
    :param int partSize: max size of each part in the multipart upload, in bytes
    :return: version of the newly uploaded file
    """
    if headerArgs is None:
        headerArgs = {}

    client = resource.meta.client
    file_size, file_time = fileSizeAndTime(localFilePath)

    version = uploadFile(localFilePath, resource, bucketName, fileID, headerArgs, partSize)
    info = client.head_object(Bucket=bucketName, Key=compat_bytes(fileID), VersionId=version, **headerArgs)
    size = info.get('ContentLength')

    assert size == file_size

    # Make reasonably sure that the file wasn't touched during the upload
    assert fileSizeAndTime(localFilePath) == (file_size, file_time)
    return version
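# A minimal usage sketch for the boto3-based uploadFromPath. The bucket name and
# local path are hypothetical; headerArgs would typically carry SSE-C parameters
# (e.g. SSECustomerAlgorithm/SSECustomerKey) when encryption is in use.
import boto3

resource = boto3.resource('s3')
version = uploadFromPath('/tmp/sample.dat', resource, 'my-toil-bucket',
                         fileID='files/sample.dat', partSize=50 << 20)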
def chunkedFileUpload(readable, bucket, fileID, file_size, headers=None, partSize=50 << 20):
    """Upload a readable to S3 in partSize chunks via the boto 2 multipart API and return the new key's version."""
    for attempt in retry_s3():
        with attempt:
            upload = bucket.initiate_multipart_upload(
                key_name=compat_bytes(fileID),
                headers=headers)
    try:
        start = 0
        part_num = itertools.count()
        while start < file_size:
            end = min(start + partSize, file_size)
            assert readable.tell() == start
            for attempt in retry_s3():
                with attempt:
                    upload.upload_part_from_file(fp=readable,
                                                 part_num=next(part_num) + 1,
                                                 size=end - start,
                                                 headers=headers)
            start = end
        assert readable.tell() == file_size == start
    except:
        with panic(log=log):
            for attempt in retry_s3():
                with attempt:
                    upload.cancel_upload()
    else:
        for attempt in retry_s3():
            with attempt:
                version = upload.complete_upload().version_id
    return version
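# Hedged usage sketch for the boto 2 based chunkedFileUpload: stream an already
# opened file into an existing boto Bucket. The bucket name and path are
# hypothetical, and boto 2 credentials are assumed to be configured.
import os
import boto

bucket = boto.connect_s3().get_bucket('my-toil-bucket')
with open('/tmp/large.dat', 'rb') as readable:
    file_size = os.fstat(readable.fileno()).st_size
    version = chunkedFileUpload(readable, bucket, 'files/large.dat', file_size)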
def _download_stream(self, fileName, encrypt=True, encoding=None, errors=None):
    """
    Yields a context manager that can be used to read from the bucket
    with a stream. See :class:`~toil.jobStores.utils.WritablePipe` for an example.

    :param fileName: name of file in bucket to be read
    :type fileName: str

    :param encrypt: whether or not the file is encrypted
    :type encrypt: bool

    :param str encoding: the name of the encoding used to encode the file. Encodings are the same
            as for encode(). Defaults to None which represents binary mode.

    :param str errors: an optional string that specifies how encoding errors are to be handled. Errors
            are the same as for open(). Defaults to 'strict' when an encoding is specified.

    :return: an instance of ReadablePipe.
    :rtype: :class:`~toil.jobStores.utils.ReadablePipe`
    """
    blob = self.bucket.get_blob(compat_bytes(fileName), encryption_key=self.sseKey if encrypt else None)
    if blob is None:
        raise NoSuchFileException(fileName)

    class DownloadPipe(ReadablePipe):
        def writeTo(self, writable):
            try:
                blob.download_to_file(writable)
            finally:
                writable.close()

    with DownloadPipe(encoding=encoding, errors=errors) as readable:
        yield readable
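# Hedged usage sketch: assuming _download_stream is decorated with @contextmanager
# on the job store class (as its generator body suggests), it is consumed like this.
# jobStore and the file name are hypothetical placeholders.
with jobStore._download_stream('files/report.txt', encrypt=True, encoding='utf-8') as readable:
    text = readable.read()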
def _upload_stream(self, fileName, update=False, encrypt=True, encoding=None, errors=None):
    """
    Yields a context manager that can be used to write to the bucket
    with a stream. See :class:`~toil.jobStores.utils.WritablePipe` for an example.

    Will throw assertion error if the file shouldn't be updated
    and yet exists.

    :param fileName: name of file to be inserted into bucket
    :type fileName: str

    :param update: whether or not the file is to be updated
    :type update: bool

    :param encrypt: whether or not the file is encrypted
    :type encrypt: bool

    :param str encoding: the name of the encoding used to encode the file. Encodings are the same
            as for encode(). Defaults to None which represents binary mode.

    :param str errors: an optional string that specifies how encoding errors are to be handled. Errors
            are the same as for open(). Defaults to 'strict' when an encoding is specified.

    :return: an instance of WritablePipe.
    :rtype: :class:`~toil.jobStores.utils.WritablePipe`
    """
    blob = self.bucket.blob(compat_bytes(fileName), encryption_key=self.sseKey if encrypt else None)

    class UploadPipe(WritablePipe):
        def readFrom(self, readable):
            if not update:
                assert not blob.exists()
            blob.upload_from_file(readable)

    with UploadPipe(encoding=encoding, errors=errors) as writable:
        yield writable
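# Companion sketch for _upload_stream under the same @contextmanager assumption;
# jobStore and the file name are hypothetical placeholders.
with jobStore._upload_stream('files/report.txt', update=False, encoding='utf-8') as writable:
    writable.write('hello from toil\n')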
def _uploadStream(self, fileName, update=False, encrypt=True):
    """
    Yields a context manager that can be used to write to the bucket
    with a stream. See :class:`~toil.jobStores.utils.WritablePipe` for an example.

    Will throw assertion error if the file shouldn't be updated
    and yet exists.

    :param fileName: name of file to be inserted into bucket
    :type fileName: str

    :param update: whether or not the file is to be updated
    :type update: bool

    :param encrypt: whether or not the file is encrypted
    :type encrypt: bool

    :return: an instance of WritablePipe.
    :rtype: :class:`~toil.jobStores.utils.WritablePipe`
    """
    blob = self.bucket.blob(compat_bytes(fileName), encryption_key=self.sseKey if encrypt else None)

    class UploadPipe(WritablePipe):
        def readFrom(self, readable):
            if not update:
                assert not blob.exists()
            blob.upload_from_file(readable)

    with UploadPipe() as writable:
        yield writable
@classmethod
def _getBlobFromURL(cls, url, exists=False):
    """
    Gets the blob specified by the url.

    Caution: makes no API request; the blob may not actually exist.

    :param urlparse.ParseResult url: the URL
    :param bool exists: if True, then syncs local blob object with cloud and raises exceptions
           if it doesn't exist remotely
    :return: the blob requested
    :rtype: :class:`~google.cloud.storage.blob.Blob`
    """
    bucketName = url.netloc
    fileName = url.path

    # remove leading '/', which can cause problems if fileName is a path
    if fileName.startswith('/'):
        fileName = fileName[1:]

    storageClient = storage.Client()
    bucket = storageClient.get_bucket(bucketName)
    blob = bucket.blob(compat_bytes(fileName))

    if exists:
        if not blob.exists():
            raise NoSuchFileException(fileName)
        # sync with cloud so info like size is available
        blob.reload()
    return blob
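# Hedged usage sketch for _getBlobFromURL: resolve a gs:// URL to a blob handle.
# The URL is hypothetical, and the call assumes the method lives on GoogleJobStore;
# exists=True forces a reload() so metadata such as blob.size is populated.
from urllib.parse import urlparse

url = urlparse('gs://my-bucket/path/to/object')
blob = GoogleJobStore._getBlobFromURL(url, exists=True)
print(blob.size)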
def _downloadStream(self, fileName, encrypt=True):
    """
    Yields a context manager that can be used to read from the bucket
    with a stream. See :class:`~toil.jobStores.utils.WritablePipe` for an example.

    :param fileName: name of file in bucket to be read
    :type fileName: str

    :param encrypt: whether or not the file is encrypted
    :type encrypt: bool

    :return: an instance of ReadablePipe.
    :rtype: :class:`~toil.jobStores.utils.ReadablePipe`
    """
    blob = self.bucket.get_blob(compat_bytes(fileName), encryption_key=self.sseKey if encrypt else None)
    if blob is None:
        raise NoSuchFileException(fileName)

    class DownloadPipe(ReadablePipe):
        def writeTo(self, writable):
            try:
                blob.download_to_file(writable)
            finally:
                writable.close()

    with DownloadPipe() as readable:
        yield readable
def readFile(self, jobStoreFileID, localFilePath, symlink=False):
    # used on non-shared files which will be encrypted if available
    # checking for JobStoreID existence
    if not self.fileExists(jobStoreFileID):
        raise NoSuchFileException(jobStoreFileID)
    # open in binary mode; blob.download_to_file() writes bytes
    with open(localFilePath, 'wb') as writeable:
        blob = self.bucket.get_blob(compat_bytes(jobStoreFileID), encryption_key=self.sseKey)
        blob.download_to_file(writeable)
def uploadFile(readable,
               resource,
               bucketName: str,
               fileID: str,
               headerArgs: Optional[dict] = None,
               partSize: int = 50 << 20):
    """
    Upload a readable object to s3, using multipart uploading if applicable.

    :param readable: a readable stream or a file path to upload to s3
    :param S3.Resource resource: boto3 resource
    :param str bucketName: name of the bucket to upload to
    :param str fileID: the name of the file to upload to
    :param dict headerArgs: http headers to use when uploading - generally used for encryption purposes
    :param int partSize: max size of each part in the multipart upload, in bytes
    :return: version of the newly uploaded file
    """
    if headerArgs is None:
        headerArgs = {}

    client = resource.meta.client
    config = TransferConfig(
        multipart_threshold=partSize,
        multipart_chunksize=partSize,
        use_threads=True
    )
    if isinstance(readable, str):
        client.upload_file(Filename=readable,
                           Bucket=bucketName,
                           Key=compat_bytes(fileID),
                           ExtraArgs=headerArgs,
                           Config=config)
    else:
        client.upload_fileobj(Fileobj=readable,
                              Bucket=bucketName,
                              Key=compat_bytes(fileID),
                              ExtraArgs=headerArgs,
                              Config=config)

    # Wait until the object exists before calling head_object
    object_summary = resource.ObjectSummary(bucketName, compat_bytes(fileID))
    object_summary.wait_until_exists(**headerArgs)

    info = client.head_object(Bucket=bucketName, Key=compat_bytes(fileID), **headerArgs)
    return info.get('VersionId', None)
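# A minimal sketch showing both accepted inputs for uploadFile: a local file path
# and an open binary stream. Bucket and key names are hypothetical; boto3
# credentials are assumed to be configured in the environment.
import boto3

resource = boto3.resource('s3')
version = uploadFile('/tmp/sample.dat', resource, 'my-toil-bucket', 'files/sample.dat')
with open('/tmp/sample.dat', 'rb') as readable:
    version = uploadFile(readable, resource, 'my-toil-bucket', 'files/sample-copy.dat')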
def _writeFile(self, jobStoreID, fileObj, update=False, encrypt=True):
    blob = self.bucket.blob(compat_bytes(jobStoreID), encryption_key=self.sseKey if encrypt else None)
    if not update:
        # TODO: should probably raise a special exception and be added to all jobStores
        assert not blob.exists()
    else:
        if not blob.exists():
            raise NoSuchFileException(jobStoreID)
    blob.upload_from_file(fileObj)
def readStatsAndLogging(self, callback, readAll=False):
    prefix = self.readStatsBaseID if readAll else self.statsBaseID
    filesRead = 0
    lastTry = False

    while True:
        filesReadThisLoop = 0
        # prefix seems broken
        for blob in self.bucket.list_blobs(prefix=compat_bytes(prefix)):
            try:
                with self.readSharedFileStream(blob.name) as readable:
                    log.debug("Reading stats file: %s", blob.name)
                    callback(readable)
                    filesReadThisLoop += 1
                if not readAll:
                    # rename this file by copying it and deleting the old version to avoid
                    # rereading it
                    newID = self.readStatsBaseID + blob.name[len(self.statsBaseID):]
                    # NOTE: just copies then deletes old.
                    self.bucket.rename_blob(blob, compat_bytes(newID))
            except NoSuchFileException:
                log.debug("Stats file not found: %s", blob.name)
        if readAll:
            # The readAll parameter is only used by the toil stats util after the completion of the
            # pipeline. Assume that this means the bucket is in a consistent state when readAll
            # is passed.
            return filesReadThisLoop
        if filesReadThisLoop == 0:
            # Listing is unfortunately eventually consistent so we can't be 100% sure there
            # really aren't any stats files left to read
            if lastTry:
                # this was our second try, we are reasonably sure there aren't any stats
                # left to gather
                break
            # Try one more time in a couple seconds
            time.sleep(5)
            lastTry = True
            continue
        else:
            lastTry = False
            filesRead += filesReadThisLoop

    return filesRead
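# Hedged usage sketch for readStatsAndLogging: gather the contents of every unread
# stats file through a callback. jobStore is a hypothetical GoogleJobStore instance.
collected = []

def collectStats(readable):
    collected.append(readable.read())

numRead = jobStore.readStatsAndLogging(collectStats)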
def read_file(self, file_id, local_path, symlink=False):
    # used on non-shared files which will be encrypted if available
    # checking for JobStoreID existence
    if not self.file_exists(file_id):
        raise NoSuchFileException(file_id)
    with AtomicFileCreate(local_path) as tmpPath:
        with open(tmpPath, 'wb') as writeable:
            blob = self.bucket.get_blob(compat_bytes(file_id), encryption_key=self.sseKey)
            blob.download_to_file(writeable)

    if getattr(file_id, 'executable', False):
        os.chmod(local_path, os.stat(local_path).st_mode | stat.S_IXUSR)
def _readContents(self, jobStoreID):
    """
    To be used on files representing jobs only, which will be encrypted if possible.

    :param jobStoreID: the ID of the job
    :type jobStoreID: str
    :return: contents of the job file
    :rtype: string
    """
    job = self.bucket.get_blob(compat_bytes(jobStoreID), encryption_key=self.sseKey)
    if job is None:
        raise NoSuchJobException(jobStoreID)
    return job.download_as_string()
def initialize(self, config=None):
    try:
        self.bucket = self.storageClient.create_bucket(self.bucketName)
    except exceptions.Conflict:
        raise JobStoreExistsException(self.locator)
    super(GoogleJobStore, self).initialize(config)

    # set up server-side encryption after we set up config in super
    if self.config.sseKey is not None:
        with open(self.config.sseKey) as f:
            self.sseKey = compat_bytes(f.read())
            assert len(self.sseKey) == 32
def job_exists(self, job_id):
    return self.bucket.blob(compat_bytes(job_id), encryption_key=self.sseKey).exists()
def file_exists(self, file_id):
    return self.bucket.blob(compat_bytes(file_id), encryption_key=self.sseKey).exists()
def get_file_size(self, file_id):
    if not self.file_exists(file_id):
        return 0
    return self.bucket.get_blob(compat_bytes(file_id), encryption_key=self.sseKey).size
def _delete(self, jobStoreFileID):
    if self.fileExists(jobStoreFileID):
        self.bucket.get_blob(compat_bytes(jobStoreFileID)).delete()
def getFileSize(self, jobStoreFileID):
    if not self.fileExists(jobStoreFileID):
        return 0
    return self.bucket.get_blob(compat_bytes(jobStoreFileID), encryption_key=self.sseKey).size
def fileExists(self, jobStoreFileID):
    return self.bucket.blob(compat_bytes(jobStoreFileID), encryption_key=self.sseKey).exists()
def delete(self, jobStoreID):
    self._delete(jobStoreID)

    # best effort delete associated files
    for blob in self.bucket.list_blobs(prefix=compat_bytes(jobStoreID)):
        self._delete(blob.name)
def getPublicUrl(self, fileName):
    blob = self.bucket.get_blob(compat_bytes(fileName), encryption_key=self.sseKey)
    if blob is None:
        raise NoSuchFileException(fileName)
    return blob.generate_signed_url(self.publicUrlExpiration)