def get_blobs(self, container: Container) -> Iterable[Blob]:
    """Yield every blob in *container*, fetching blob metadata as well."""
    azure_container = self._get_azure_container(container.name)
    listing = self.service.list_blobs(
        azure_container.name,
        include=Include(metadata=True),
    )
    for raw_blob in listing:
        yield self._convert_azure_blob(container, raw_blob)
def list_object_keys(self, prefix='', metadata=False, pagesize=1000):
    """List object keys matching a prefix for the WABS client.

    :param str prefix: A prefix string to filter object keys by.
    :param bool metadata: When True, object metadata is fetched along with
        each object. Defaults to False.
    :param int pagesize: Maximum number of objects fetched per WABS API
        call (WABS caps this at 5000).
    :returns: A generator of dicts with ``key``, ``size``, ``last_modified``
        and ``metadata`` keys (``metadata`` is populated only if requested).
    :rtype: Iterator[dict]
    """
    logger.debug("Listing files for prefix: {0}".format(prefix))
    wanted = Include(metadata=metadata)
    continuation = None
    while True:
        if continuation:
            logger.debug("Paging objects from marker '{0}'".format(continuation))
        # Fetch one page of results, resuming from the previous marker.
        page = self.client.list_blobs(self.container_name,
                                      prefix=prefix,
                                      num_results=pagesize,
                                      include=wanted,
                                      marker=continuation)
        for blob in page:
            yield {
                'key': blob.name,
                'last_modified': blob.properties.last_modified,
                'size': blob.properties.content_length,
                'metadata': blob.metadata
            }
        # An empty next_marker means the listing is exhausted.
        continuation = page.next_marker
        if not continuation:
            break
def get_blobs(self, container: Container, prefix: str = '',
              delimiter: str = '') -> Iterable[Blob]:
    """Yield blobs in *container* whose names match *prefix*.

    *delimiter* is passed through to the service to enable virtual
    directory-style listing; metadata is always included.
    """
    azure_container = self._get_azure_container(container.name)
    listing = self.service.list_blobs(azure_container.name,
                                      prefix=prefix,
                                      delimiter=delimiter,
                                      include=Include(metadata=True))
    for raw_blob in listing:
        yield self._convert_azure_blob(container, raw_blob)
def list_from_group(self, file_group, remote_path=None, num_results=None):
    """List the files in the file group.

    :param str file_group: The file group from which to list the files.
    :param str remote_path: The remote file prefix by which to filter results.
    :param int num_results: The max number of files to return.
    :returns: A generator of files. Each file is a dictionary with keys:
        'name': The full remote file path.
        'last_modified': When the file was last modified locally
            (read from the blob's 'lastmodified' metadata entry).
        'size': The content length of the file.
        'uploaded': When the file was last modified remotely.
    """
    blob_service = self.get_storage_client()
    container_name = file_utils.get_container_name(file_group)
    # Metadata is required to recover the local 'lastmodified' stamp.
    found = blob_service.list_blobs(container_name,
                                    prefix=remote_path,
                                    num_results=num_results,
                                    include=Include(metadata=True))
    return ({'name': blob.name,
             'last_modified': blob.metadata.get('lastmodified'),
             'size': blob.properties.content_length,
             'uploaded': blob.properties.last_modified}
            for blob in found)
def storage_blob_copy_batch(client, source_account, source_container,
                            destination_container, source_sas=None,
                            prefix=None, recursive=False, snapshots=False,
                            exclude_old=False, exclude_new=False):
    """Copy blobs between containers and storage accounts. This is a
    server-side copy operation therefore the command is asynchronous.

    :param str source_account: The account name of the source storage account
    :param str source_container: The source blob container
    :param str source_sas: The shared access signature used to access the source
        container. It is not required if either a connection string is given or
        the source container doesn't require a sas.
    :param str destination_container: The destination blob container
    :param str prefix: If option --recursive is specified, then this command
        interprets the given pattern as a blob prefix. If option --recursive is
        not specified, then the given pattern is matched against exact blob
        names.
    :param bool recursive: Copy all the files to the given container and
        maintain the folder structure.
    :param bool snapshots: Copy both the blobs and their snapshots.
    :param bool exclude_old: Excludes an older source resource. The resource
        will not be copied if the last modified time of the source is the same
        or older than destination.
    :param bool exclude_new: Excludes a newer source resource. The resource
        will not be copied if the last modified time of the source is the same
        or newer than destination.
    :return: A list of BlobCopyResult instances summarizing the operations
    """
    # TODO:
    # 1. Page the result list (using num_result and marker)
    # 2. Support connection string for source
    # 3. stop using 'baseblobservice.exists' function. it doesn't provide
    #    performance gain since it invokes get_blob_properties anyway.
    # Question:
    # 1. Performance of creating a source blob service
    src_client = BlockBlobService(account_name=source_account,
                                  sas_token=source_sas)

    def _get_blob_name(source_blob):
        # Destination name for a source blob. For a snapshot, the snapshot
        # timestamp is embedded before the file extension so that copies of
        # multiple snapshots of one blob get distinct names.
        name = source_blob.name
        if source_blob.snapshot is not None:
            # the snapshot time string has seven digits in the microseconds,
            # which makes strptime nearly unusable; therefore characters after
            # the dot are thrown away
            time_string = source_blob.snapshot[:source_blob.snapshot.rfind('.')]
            snapshot_time = datetime.strptime(time_string, '%Y-%m-%dT%H:%M:%S')
            # insert the date time string before the file extension
            dot = name.rfind('.')
            dot = len(name) if dot == -1 else dot
            name = '{0}({1}){2}'.format(
                name[0:dot], snapshot_time.strftime('%Y-%m-%d %H%M%S'),
                name[dot:])
        return name

    def _get_blob_url(source_blob):
        # Build a full, sas-authenticated URL for the source blob, manually
        # appending the snapshot query parameter when needed.
        # to be removed once this issue is fixed:
        # https://github.com/Azure/azure-storage-python/issues/233
        src_url = src_client.make_blob_url(source_container,
                                           source_blob.name,
                                           sas_token=source_sas)
        if source_blob.snapshot is not None:
            # this is a blob snapshot
            if '?' in src_url:
                src_url += '&snapshot=' + str(source_blob.snapshot)
            else:
                src_url += '?snapshot=' + str(source_blob.snapshot)
        return src_url

    def _copy_single_blob(source_blob):
        # Issue one asynchronous server-side copy, optionally adding
        # last-modified preconditions for --exclude-old / --exclude-new.
        kwargs = {
            "container_name": destination_container,
            "blob_name": _get_blob_name(source_blob),
            "copy_source": _get_blob_url(source_blob)
        }
        # NOTE(review): the exists/get_blob_properties checks below use
        # source_blob.name, while the copy itself targets
        # _get_blob_name(source_blob); those names differ for snapshots —
        # confirm this is intended.
        if (exclude_new or exclude_old) and client.exists(
                destination_container, source_blob.name):
            if exclude_old:
                destination_blob = client.get_blob_properties(
                    destination_container, source_blob.name)
                kwargs["source_if_modified_since"] = \
                    destination_blob.properties.last_modified
            if exclude_new:
                kwargs["destination_if_modified_since"] = \
                    source_blob.properties.last_modified
        return client.copy_blob(**kwargs)

    # Recursive mode lists everything under the prefix; otherwise the prefix
    # is treated as one exact blob name (and may match nothing).
    if recursive:
        source_blobs = src_client.list_blobs(
            source_container,
            prefix=prefix,
            include=Include(snapshots=True) if snapshots else None)
    elif src_client.exists(source_container, prefix):
        source_blobs = [
            src_client.get_blob_properties(source_container, prefix)
        ]
    else:
        source_blobs = []
    return [
        BlobCopyResult(b.name, _copy_single_blob(b).id) for b in source_blobs
    ]