def generate_token():
    """Generate a short-lived account-level SAS token for blob storage.

    The shared-key credential is validated first by making one real request
    (listing containers); on any failure the function keeps the original
    error-as-string contract and returns 'cannot generate the sas token'.

    Returns:
        str: an account SAS token valid for 1 hour, or the error message.
    """
    blob_service_client = BlobServiceClient(account_url=config.URL,
                                            credential=config.SHARED_KEY)
    try:
        # list_containers() is lazy; iterating forces a round-trip, which
        # verifies the credential. One item is enough — no need to walk
        # every container as the original did.
        for _ in blob_service_client.list_containers():
            break
    except Exception:  # narrowed from a bare except; still best-effort
        return 'cannot generate the sas token'

    # Account SAS scoped to objects with read/write/add/create, 1 hour expiry.
    sas_token = generate_account_sas(
        blob_service_client.account_name,
        account_key=blob_service_client.credential.account_key,
        resource_types=ResourceTypes(object=True),
        permission=AccountSasPermissions(read=True, write=True, add=True,
                                         create=True),
        expiry=datetime.utcnow() + timedelta(hours=1))
    return sas_token
def add_sanitizers(test_proxy):
    """Register recording sanitizers, run the tests, then clean up Dogfood blobs.

    Sanitizers strip subscription-key headers and replace real storage /
    cognitive-services hostnames with "fakeendpoint" before recordings are
    persisted.
    """
    add_remove_header_sanitizer(headers="Ocp-Apim-Subscription-Key")
    add_remove_header_sanitizer(headers="Retry-After")
    add_general_regex_sanitizer(
        value="fakeendpoint",
        regex="(?<=\\/\\/)[a-z-]+(?=\\.cognitiveservices\\.azure\\.com)"
    )
    add_general_regex_sanitizer(
        regex="(?<=\\/\\/)[a-z]+(?=(?:|-secondary)\\.(?:table|blob|queue)\\.core\\.windows\\.net)",
        value="fakeendpoint",
    )
    add_oauth_response_sanitizer()

    # run tests
    yield

    # Dogfood env uses a static storage account so we clean up the blob resources
    # This is unnecessary for AzureCloud where each storage account is deleted at the end of testing
    if is_live() and os.getenv("TRANSLATION_ENVIRONMENT") == "Dogfood":
        storage_name = os.getenv("TRANSLATION_DOCUMENT_STORAGE_NAME")
        blob_service = BlobServiceClient(
            "https://" + storage_name + ".blob.core.windows.net/",
            os.getenv("TRANSLATION_DOCUMENT_STORAGE_KEY")
        )
        for container in blob_service.list_containers():
            blob_service.delete_container(container)
def azblob_file(azblob_credentials, cloud_bucket_name, download_gcs_public_data, public=False):
    """Fixture: upload a CSV into a fresh blob container and yield its path.

    Yields "<container>/myfile.csv"; on teardown the container is deleted.
    When `public` is true the container is created with container-level
    public access.
    """
    acc_url = f"https://{azblob_credentials['storage_account']}.blob.core.windows.net"
    service = BlobServiceClient(account_url=acc_url,
                                credential=azblob_credentials["shared_key"])

    # Random suffix keeps parallel test runs from colliding.
    container_name = cloud_bucket_name + random_char(3).lower()
    if public:
        container_name += "public"
    print(f"\nUpload dataset to private azure blob container {container_name}")

    existing = {cntr["name"] for cntr in service.list_containers()}
    if container_name not in existing:
        access = "container" if public else None
        service.create_container(name=container_name, metadata=None,
                                 public_access=access)

    blob = service.get_blob_client(container_name, "myfile.csv")
    with open(download_gcs_public_data, "r") as f:
        blob.upload_blob(f.read(), blob_type="BlockBlob", overwrite=True)

    yield f"{container_name}/myfile.csv"

    service.delete_container(container_name)
    print(
        f"\nAzure Blob Container {container_name} is now marked for deletion")
def list_containers(account_name, sas_token, required_string=None):
    """Enumerate blob containers, optionally filtered by a substring.

    Args:
        account_name: storage account name (bare, no URL).
        sas_token: SAS token with or without its leading '?'.
        required_string: when given, keep only names containing it.

    Returns:
        list[str]: the matching container names.
    """
    # Normalize the token so it works as a URL-query credential.
    if not sas_token.startswith('?'):
        sas_token = '?' + sas_token

    storage_account_url_blob = 'https://' + account_name + '.blob.core.windows.net'
    blob_service_client = BlobServiceClient(
        account_url=storage_account_url_blob, credential=sas_token)

    containers = []
    for container in blob_service_client.list_containers(include_metadata=False):
        name = container['name']
        if required_string is None or required_string in name:
            containers.append(name)
        else:
            # A filter was supplied and this name did not match.
            print('Skipping container {}'.format(name))

    print('Enumerated {} containers:'.format(len(containers)))
    print(containers)
    return containers
def upload_third_party(self):
    """Sync every subdirectory of self.third_party into a same-named container.

    Missing containers are created, then azcopy mirrors each local directory
    using a 30-minute container SAS, deleting remote files absent locally.
    """
    logger.info("uploading third-party tools from %s", self.third_party)
    account_name = self.results["deploy"]["fuzz-name"]["value"]
    key = self.results["deploy"]["fuzz-key"]["value"]
    account_url = "https://%s.blob.core.windows.net" % account_name
    client = BlobServiceClient(account_url, credential=key)
    existing = [entry["name"] for entry in client.list_containers()]

    for name in os.listdir(self.third_party):
        path = os.path.join(self.third_party, name)
        if not os.path.isdir(path):
            continue
        if name not in existing:
            client.create_container(name)

        sas = generate_container_sas(
            account_name,
            name,
            account_key=key,
            permission=ContainerSasPermissions(
                read=True, write=True, delete=True, list=True
            ),
            expiry=datetime.utcnow() + timedelta(minutes=30),
        )
        url = "%s/%s?%s" % (account_url, name, sas)
        subprocess.check_output(
            [self.azcopy, "sync", path, url, "--delete-destination", "true"]
        )
def upload_instance_setup(self):
    """Sync self.instance_specific into the 'instance-specific-setup' container."""
    logger.info("uploading instance-specific-setup from %s", self.instance_specific)
    account_name = self.results["deploy"]["func-name"]["value"]
    key = self.results["deploy"]["func-key"]["value"]
    account_url = "https://%s.blob.core.windows.net" % account_name
    client = BlobServiceClient(account_url, credential=key)

    container = "instance-specific-setup"
    existing = [entry["name"] for entry in client.list_containers()]
    if container not in existing:
        client.create_container(container)

    # Short-lived SAS (30 min) scoped to this one container.
    sas = generate_container_sas(
        account_name,
        container,
        account_key=key,
        permission=ContainerSasPermissions(
            read=True, write=True, delete=True, list=True
        ),
        expiry=datetime.utcnow() + timedelta(minutes=30),
    )
    url = "%s/%s?%s" % (account_url, container, sas)

    subprocess.check_output(
        [
            self.azcopy,
            "sync",
            self.instance_specific,
            url,
            "--delete-destination",
            "true",
        ]
    )
def add_log_export(self) -> None:
    """Export Application Insights telemetry to the 'app-insights' container.

    Creates the container if needed, mints a long-lived write-only container
    SAS, and (re)creates an App Insights continuous-export configuration
    pointing at it. Any existing export targeting the same account and
    container is deleted first.
    """
    if not self.export_appinsights:
        logger.info("not exporting appinsights")
        return
    container_name = "app-insights"
    logger.info("adding appinsight log export")
    # NOTE(review): these keys use underscores ("func_name"/"func_key") while
    # sibling methods read "func-name"/"func-key" — confirm which spelling the
    # deploy results actually contain.
    account_name = self.results["deploy"]["func_name"]["value"]
    key = self.results["deploy"]["func_key"]["value"]
    account_url = "https://%s.blob.core.windows.net" % account_name
    client = BlobServiceClient(account_url, credential=key)
    if container_name not in [x["name"] for x in client.list_containers()]:
        client.create_container(container_name)

    # Long-lived (~2 year) write-only SAS for the export destination.
    expiry = datetime.utcnow() + timedelta(days=2 * 365)

    # NOTE: as this is a long-lived SAS url, it should not be logged and only
    # used in the later-on export_configurations.create() call
    sas = generate_container_sas(
        account_name,
        container_name,
        account_key=key,
        permission=ContainerSasPermissions(write=True),
        expiry=expiry,
    )
    url = "%s/%s?%s" % (account_url, container_name, sas)

    record_types = (
        "Requests, Event, Exceptions, Metrics, PageViews, "
        "PageViewPerformance, Rdd, PerformanceCounters, Availability")

    req = ApplicationInsightsComponentExportRequest(
        record_types=record_types,
        destination_type="Blob",
        is_enabled="true",
        destination_address=url,
    )

    credential = AzureCliCredential()
    app_insight_client = ApplicationInsightsManagementClient(
        credential,
        subscription_id=self.get_subscription_id(),
    )

    # Collect ids of prior exports aimed at the same account + container so
    # they can be replaced rather than duplicated.
    to_delete = []
    for entry in app_insight_client.export_configurations.list(
            self.resource_group, self.application_name):
        if (entry.storage_name == account_name
                and entry.container_name == container_name):
            to_delete.append(entry.export_id)

    for export_id in to_delete:
        logger.info("replacing existing export: %s", export_id)
        app_insight_client.export_configurations.delete(
            self.resource_group, self.application_name, export_id)

    app_insight_client.export_configurations.create(
        self.resource_group, self.application_name, req)
def get_blob_containers_by_storage_account(self, storage_account_name, account_key):
    """Return the container iterator for the given storage account.

    Args:
        storage_account_name: bare account name (no scheme or suffix).
        account_key: shared account access key.

    Returns:
        Iterable of container properties from list_containers().
    """
    # Bug fix: the account_url was missing its scheme. BlobServiceClient
    # rejects a bare "<account>.blob.core.windows.net" host (urlparse leaves
    # the netloc empty), so the URL must start with https://.
    blob_service = BlobServiceClient(
        account_url=f'https://{storage_account_name}.blob.core.windows.net',
        credential=account_key)
    containers = blob_service.list_containers()
    return containers
def check_storage_account(account_name, key):
    """Return the containers of `account_name` whose public access is enabled."""
    service = BlobServiceClient(ENDPOINT_URL.format(account_name),
                                credential=key)
    # public_access is None when the container is private; keep the rest.
    return [
        cont for cont in service.list_containers(timeout=15)
        if cont.public_access is not None
    ]
def check_if_container_exists(self, BlobServiceClient, containername):
    """Return True when `containername` is among the service's containers.

    Note: the second parameter shadows the azure BlobServiceClient class
    name; it is kept unchanged for call-site compatibility.
    """
    return any(container.name == containername
               for container in BlobServiceClient.list_containers())
def get_container(blob_service_client: BlobServiceClient, container_name: str):
    """Get (or create) a container client for `container_name`.

    Returns the client from get_container_client(); if that call raises, the
    container is created instead and that result is returned.

    Fixes vs. original: removed a duplicated, discarded get_container_client
    call; narrowed the bare `except:`; dropped the incorrect `-> bool` return
    annotation (the function returns a client, not a boolean).

    NOTE(review): get_container_client() does not normally contact the
    service, so it is unlikely to raise for a missing container — confirm
    whether an explicit exists()/create flow was intended here.
    """
    logging.info('blob_service_client.list_containers()')
    logging.info(list(blob_service_client.list_containers()))
    try:
        container_client = blob_service_client.get_container_client(
            container_name)
    except Exception:  # narrowed from bare except; fall back to creating it
        container_client = blob_service_client.create_container(container_name)
    return container_client
def list_blobs():
    """Return the account's container names joined into a single string.

    Authenticates with DefaultAzureCredential, which tries credential sources
    in order (environment variables, managed identity, shared token cache,
    VS Code, Azure CLI) — so a managed identity is used on Azure and local
    developer credentials are used on a workstation, automatically.
    """
    # Endpoint URL of the target blob account.
    blob_url = "https://{}.blob.core.windows.net".format(
        os.getenv("AZURE_STORAGE_ACCOUNT_NAME"))

    cred = DefaultAzureCredential()

    # The SDK's default ExponentialRetry (initial_backoff=15, increment_base=3,
    # retry_total=3, random_jitter_range=3) waits roughly 18±3s, 24±3s and
    # 42±3s between attempts, which would hold the caller's HTTP connection
    # open far too long. Use a much tighter schedule instead:
    # (0.5+1.2^1)=1.7±0.2s, (0.5+1.2^2)=1.94±0.2s, (0.5+1.2^3)=2.228±0.2s.
    retry = ExponentialRetry(initial_backoff=0.5,
                             increment_base=1.2,
                             random_jitter_range=0.2)

    client = BlobServiceClient(blob_url, cred, retry_policy=retry)

    names = [
        container.get("name", "unknown")
        for container in client.list_containers()
    ]
    return ", ".join(names)
def check_storage_account(account_name, key):
    """Collect publicly accessible containers, tolerating scan failures.

    Returns the (possibly partial) list of containers with public access
    enabled; if an HTTP error occurs mid-scan the account is skipped with a
    console message, keeping whatever was collected before the failure.
    """
    client = BlobServiceClient(ENDPOINT_URL.format(account_name),
                               credential=key)
    containers = client.list_containers(timeout=15)
    public_containers = []
    try:
        # Iterating the pager performs the network calls, so HTTP failures
        # surface inside this loop. A comprehension is deliberately avoided:
        # it would discard the partial results on error.
        for container in containers:
            if container.public_access is not None:
                public_containers.append(container)
    except azure.core.exceptions.HttpResponseError:
        print(
            "\t\t[-] Could not scan account {}, skipping".format(account_name),
            flush=True)
    return public_containers
def test_create_container_with_default_cpk_n_deny_override(
        self, resource_group, location, storage_account, storage_account_key):
    """A container created with a deny-override encryption scope must reject
    per-blob encryption scopes and stamp its default scope on every upload."""
    # Arrange: tiny transfer thresholds force multi-part code paths.
    bsc = BlobServiceClient(self.account_url(storage_account, "blob"),
                            credential=storage_account_key,
                            connection_data_block_size=1024,
                            max_single_put_size=1024,
                            min_large_block_upload_threshold=1024,
                            max_block_size=1024,
                            max_page_size=1024)
    container_client = bsc.create_container(
        'denyoverridecpkcontainer',
        container_encryption_scope=
        TEST_CONTAINER_ENCRYPTION_KEY_SCOPE_DENY_OVERRIDE)
    container_props = container_client.get_container_properties()
    self.assertEqual(
        container_props.encryption_scope.default_encryption_scope,
        TEST_CONTAINER_ENCRYPTION_KEY_SCOPE.default_encryption_scope)
    self.assertEqual(
        container_props.encryption_scope.prevent_encryption_scope_override,
        True)
    # The same scope settings should hold when listing containers.
    # NOTE(review): the loop body re-asserts `container_props` (fetched above)
    # rather than the listed `container` — confirm the listed item was meant.
    for container in bsc.list_containers(
            name_starts_with='denyoverridecpkcontainer'):
        self.assertEqual(
            container_props.encryption_scope.default_encryption_scope,
            TEST_CONTAINER_ENCRYPTION_KEY_SCOPE.default_encryption_scope)
        self.assertEqual(
            container_props.encryption_scope.
            prevent_encryption_scope_override, True)

    blob_client = container_client.get_blob_client("appendblob")

    # It's not allowed to set encryption scope on the blob when the container denies encryption scope override.
    with self.assertRaises(HttpResponseError):
        blob_client.upload_blob(b'aaaa',
                                BlobType.AppendBlob,
                                encryption_scope=TEST_ENCRYPTION_KEY_SCOPE)

    # Without an explicit scope, the container default is applied.
    resp = blob_client.upload_blob(b'aaaa', BlobType.AppendBlob)
    self.assertEqual(
        resp['encryption_scope'],
        TEST_CONTAINER_ENCRYPTION_KEY_SCOPE.default_encryption_scope)

    container_client.delete_container()
def check_storage_account(account_name, key):
    """Collect publicly accessible containers, surfacing per-item errors via
    iterator_wrapper so one bad container does not abort the whole scan.

    Returns the list of containers whose public_access is set.
    """
    blob_service_client = BlobServiceClient(ENDPOINT_URL.format(account_name),
                                            credential=key)
    containers = blob_service_client.list_containers(timeout=15)
    public_containers = list()
    # iterator_wrapper yields (item, error) pairs; STOP_SCAN_FLAG appears to
    # signal an externally requested stop — semantics are defined by the
    # sibling helper, confirm against its implementation.
    for cont, e in iterator_wrapper(containers):
        if cont == STOP_SCAN_FLAG:
            break
        if e:
            # StopIteration marks normal exhaustion; anything else is a real
            # scan error for this item — report it and keep going.
            if type(e) is not StopIteration:
                print("\t\t[-] Could not scan the container of the account{} due to the error{}. skipping".format(account_name, e), flush=True)
                continue
            else:
                break
        if cont.public_access is not None:
            public_containers.append(cont)
    return public_containers
def main(req: func.HttpRequest) -> func.HttpResponse: logging.info('Python HTTP trigger function processed a request.') #demo starts here new_module.new_func() #shows custom module import action = req.params["action"] if action == "SQL": server = "<SECRET>" database = "<SECRET>" username = "******" password = "******" cnxn = pyodbc.connect( 'DRIVER={ODBC Driver 17 for SQL Server};SERVER=' + server + ';DATABASE=' + database + ';UID=' + username + ';PWD=' + password) cursor = cnxn.cursor() #logging.info(cursor) cursor.execute("select * from testTable") row = cursor.fetchone() json_array = [] for i in row: json_array.append(i) json_result = json.dumps({"data": json_array}) return func.HttpResponse(json_result) elif action == "blob": #blob test logging.info("blob action") credential = "<SECRET>" service = BlobServiceClient(account_url="<SECRET>", credential=credential) container_names = next(service.list_containers()) #logging.info(container_names) container = service.get_container_client("<SECRET>") blob_names = next(container.list_blobs()) #logging.info(blob_names) blob = container.get_blob_client("<SECRET>") dl_stream = blob.download_blob() #logging.info(dl_stream.content_as_text()) return func.HttpResponse(dl_stream.content_as_text()) else: return func.HttpResponse("### WRONG ACTION, CHECK API. ###")
def test_create_container_with_default_cpk_n(self, resource_group, location,
                                             storage_account,
                                             storage_account_key):
    """A container default encryption scope is applied, but a per-blob scope
    may still override it when override is not prevented."""
    # Arrange: tiny transfer thresholds force multi-part code paths.
    bsc = BlobServiceClient(self.account_url(storage_account, "blob"),
                            credential=storage_account_key,
                            connection_data_block_size=1024,
                            max_single_put_size=1024,
                            min_large_block_upload_threshold=1024,
                            max_block_size=1024,
                            max_page_size=1024)
    container_client = bsc.create_container(
        'cpkcontainer',
        container_encryption_scope=TEST_CONTAINER_ENCRYPTION_KEY_SCOPE)
    container_props = container_client.get_container_properties()
    self.assertEqual(
        container_props.encryption_scope.default_encryption_scope,
        TEST_CONTAINER_ENCRYPTION_KEY_SCOPE.default_encryption_scope)
    self.assertEqual(
        container_props.encryption_scope.prevent_encryption_scope_override,
        False)
    # The same scope settings should hold when listing containers.
    # NOTE(review): the loop body re-asserts `container_props` (fetched above)
    # rather than the listed `container` — confirm the listed item was meant.
    for container in bsc.list_containers(name_starts_with='cpkcontainer'):
        self.assertEqual(
            container_props.encryption_scope.default_encryption_scope,
            TEST_CONTAINER_ENCRYPTION_KEY_SCOPE.default_encryption_scope)
        self.assertEqual(
            container_props.encryption_scope.
            prevent_encryption_scope_override, False)

    blob_client = container_client.get_blob_client("appendblob")

    # providing encryption scope when upload the blob
    resp = blob_client.upload_blob(
        b'aaaa', BlobType.AppendBlob,
        encryption_scope=TEST_ENCRYPTION_KEY_SCOPE)
    # Use the provided encryption scope on the blob
    self.assertEqual(resp['encryption_scope'], TEST_ENCRYPTION_KEY_SCOPE)

    container_client.delete_container()
def add_instance_id(self) -> None:
    """Ensure a persistent instance_id blob exists and log its value.

    Reads the existing UUID from base-config/instance_id when present;
    otherwise generates a new UUID and uploads it.
    """
    logger.info("setting instance_id log export")
    container_name = "base-config"
    blob_name = "instance_id"
    account_name = self.results["deploy"]["func-name"]["value"]
    key = self.results["deploy"]["func-key"]["value"]
    account_url = "https://%s.blob.core.windows.net" % account_name
    client = BlobServiceClient(account_url, credential=key)

    existing = [entry["name"] for entry in client.list_containers()]
    if container_name not in existing:
        client.create_container(container_name)

    blob_client = client.get_blob_client(container_name, blob_name)
    if blob_client.exists():
        logger.debug("instance_id already exists")
        raw = blob_client.download_blob().readall().decode()
        instance_id = uuid.UUID(raw)
    else:
        logger.debug("creating new instance_id")
        instance_id = uuid.uuid4()
        blob_client.upload_blob(str(instance_id))

    logger.info("instance_id: %s", instance_id)
def connect_container(service: BlobServiceClient, container: str, create=True) -> ContainerClient:
    """Return a ContainerClient for `container`, creating it when allowed.

    When the container does not exist: create it if `create` is truthy,
    otherwise log an error and exit the process.
    """
    known_names = [entry['name'] for entry in service.list_containers()]
    container_client = service.get_container_client(container)

    if container not in known_names:
        if create:
            operation = container_client.create_container()
            if operation['error_code'] is not None:
                raise Exception(operation['error_code'])
            log.info(f"Created container {container}, request_id: {operation['request_id']}.")
        else:
            log.error(f'Container {container} not found.')
            exit(1)

    return container_client
def upload_tools(self) -> None:
    """Copy then sync the local tools directory into the 'tools' container."""
    logger.info("uploading tools from %s", self.tools)
    account_name = self.results["deploy"]["func-name"]["value"]
    key = self.results["deploy"]["func-key"]["value"]
    account_url = "https://%s.blob.core.windows.net" % account_name
    client = BlobServiceClient(account_url, credential=key)

    existing = [entry["name"] for entry in client.list_containers()]
    if "tools" not in existing:
        client.create_container("tools")

    # Short-lived SAS (30 min) scoped to the tools container.
    sas = generate_container_sas(
        account_name,
        "tools",
        account_key=key,
        permission=ContainerSasPermissions(read=True,
                                           write=True,
                                           delete=True,
                                           list=True),
        expiry=datetime.utcnow() + timedelta(minutes=30),
    )
    url = "%s/%s?%s" % (account_url, "tools", sas)

    # "copy" uploads everything, then "sync" prunes remote files that no
    # longer exist locally.
    subprocess.check_output([
        self.azcopy,
        "copy",
        os.path.join(self.tools, "*"),
        url,
        "--overwrite=true",
        "--recursive=true",
    ])
    subprocess.check_output([
        self.azcopy, "sync", self.tools, url, "--delete-destination", "true"
    ])
class DataLakeServiceClient(StorageAccountHostsMixin): """A client to interact with the DataLake Service at the account level. This client provides operations to retrieve and configure the account properties as well as list, create and delete file systems within the account. For operations relating to a specific file system, directory or file, clients for those entities can also be retrieved using the `get_client` functions. :ivar str url: The full endpoint URL to the datalake service endpoint. :ivar str primary_endpoint: The full primary endpoint URL. :ivar str primary_hostname: The hostname of the primary endpoint. :param str account_url: The URL to the DataLake storage account. Any other entities included in the URL path (e.g. file system or file) will be discarded. This URL can be optionally authenticated with a SAS token. :param credential: The credentials with which to authenticate. This is optional if the account URL already has a SAS token. The value can be a SAS token string, an instance of a AzureSasCredential from azure.core.credentials, an account shared access key, or an instance of a TokenCredentials class from azure.identity. If the resource URI already contains a SAS token, this will be ignored in favor of an explicit credential - except in the case of AzureSasCredential, where the conflicting SAS tokens will raise a ValueError. .. admonition:: Example: .. literalinclude:: ../samples/datalake_samples_service.py :start-after: [START create_datalake_service_client] :end-before: [END create_datalake_service_client] :language: python :dedent: 8 :caption: Creating the DataLakeServiceClient from connection string. .. literalinclude:: ../samples/datalake_samples_service.py :start-after: [START create_datalake_service_client_oauth] :end-before: [END create_datalake_service_client_oauth] :language: python :dedent: 8 :caption: Creating the DataLakeServiceClient with Azure Identity credentials. 
""" def __init__( self, account_url, # type: str credential=None, # type: Optional[Any] **kwargs # type: Any ): # type: (...) -> None try: if not account_url.lower().startswith('http'): account_url = "https://" + account_url except AttributeError: raise ValueError("Account URL must be a string.") parsed_url = urlparse(account_url.rstrip('/')) if not parsed_url.netloc: raise ValueError("Invalid URL: {}".format(account_url)) blob_account_url = convert_dfs_url_to_blob_url(account_url) self._blob_account_url = blob_account_url self._blob_service_client = BlobServiceClient(blob_account_url, credential, **kwargs) self._blob_service_client._hosts[LocationMode.SECONDARY] = "" #pylint: disable=protected-access _, sas_token = parse_query(parsed_url.query) self._query_str, self._raw_credential = self._format_query_string( sas_token, credential) super(DataLakeServiceClient, self).__init__(parsed_url, service='dfs', credential=self._raw_credential, **kwargs) # ADLS doesn't support secondary endpoint, make sure it's empty self._hosts[LocationMode.SECONDARY] = "" def __enter__(self): self._blob_service_client.__enter__() return self def __exit__(self, *args): self._blob_service_client.close() def close(self): # type: () -> None """ This method is to close the sockets opened by the client. It need not be used when using with a context manager. """ self._blob_service_client.close() def _format_url(self, hostname): """Format the endpoint URL according to hostname """ formated_url = "{}://{}/{}".format(self.scheme, hostname, self._query_str) return formated_url @classmethod def from_connection_string( cls, conn_str, # type: str credential=None, # type: Optional[Any] **kwargs # type: Any ): # type: (...) -> DataLakeServiceClient """ Create DataLakeServiceClient from a Connection String. :param str conn_str: A connection string to an Azure Storage account. :param credential: The credentials with which to authenticate. 
This is optional if the account URL already has a SAS token, or the connection string already has shared access key values. The value can be a SAS token string, an instance of a AzureSasCredential from azure.core.credentials, an account shared access key, or an instance of a TokenCredentials class from azure.identity. Credentials provided here will take precedence over those in the connection string. :return a DataLakeServiceClient :rtype ~azure.storage.filedatalake.DataLakeServiceClient .. admonition:: Example: .. literalinclude:: ../samples/datalake_samples_file_system.py :start-after: [START create_data_lake_service_client_from_conn_str] :end-before: [END create_data_lake_service_client_from_conn_str] :language: python :dedent: 8 :caption: Creating the DataLakeServiceClient from a connection string. """ account_url, _, credential = parse_connection_str( conn_str, credential, 'dfs') return cls(account_url, credential=credential, **kwargs) def get_user_delegation_key( self, key_start_time, # type: datetime key_expiry_time, # type: datetime **kwargs # type: Any ): # type: (...) -> UserDelegationKey """ Obtain a user delegation key for the purpose of signing SAS tokens. A token credential must be present on the service object for this request to succeed. :param ~datetime.datetime key_start_time: A DateTime value. Indicates when the key becomes valid. :param ~datetime.datetime key_expiry_time: A DateTime value. Indicates when the key stops being valid. :keyword int timeout: The timeout parameter is expressed in seconds. :return: The user delegation key. :rtype: ~azure.storage.filedatalake.UserDelegationKey .. admonition:: Example: .. literalinclude:: ../samples/datalake_samples_service.py :start-after: [START get_user_delegation_key] :end-before: [END get_user_delegation_key] :language: python :dedent: 8 :caption: Get user delegation key from datalake service client. 
""" delegation_key = self._blob_service_client.get_user_delegation_key( key_start_time=key_start_time, key_expiry_time=key_expiry_time, **kwargs) # pylint: disable=protected-access return UserDelegationKey._from_generated(delegation_key) # pylint: disable=protected-access def list_file_systems( self, name_starts_with=None, # type: Optional[str] include_metadata=None, # type: Optional[bool] **kwargs): # type: (...) -> ItemPaged[FileSystemProperties] """Returns a generator to list the file systems under the specified account. The generator will lazily follow the continuation tokens returned by the service and stop when all file systems have been returned. :param str name_starts_with: Filters the results to return only file systems whose names begin with the specified prefix. :param bool include_metadata: Specifies that file system metadata be returned in the response. The default value is `False`. :keyword int results_per_page: The maximum number of file system names to retrieve per API call. If the request does not specify the server will return up to 5,000 items per page. :keyword int timeout: The timeout parameter is expressed in seconds. :keyword bool include_deleted: Specifies that deleted file systems to be returned in the response. This is for file system restore enabled account. The default value is `False`. .. versionadded:: 12.3.0 :returns: An iterable (auto-paging) of FileSystemProperties. :rtype: ~azure.core.paging.ItemPaged[~azure.storage.filedatalake.FileSystemProperties] .. admonition:: Example: .. literalinclude:: ../samples/datalake_samples_service.py :start-after: [START list_file_systems] :end-before: [END list_file_systems] :language: python :dedent: 8 :caption: Listing the file systems in the datalake service. 
""" item_paged = self._blob_service_client.list_containers( name_starts_with=name_starts_with, include_metadata=include_metadata, **kwargs) # pylint: disable=protected-access item_paged._page_iterator_class = FileSystemPropertiesPaged # pylint: disable=protected-access return item_paged def create_file_system( self, file_system, # type: Union[FileSystemProperties, str] metadata=None, # type: Optional[Dict[str, str]] public_access=None, # type: Optional[PublicAccess] **kwargs): # type: (...) -> FileSystemClient """Creates a new file system under the specified account. If the file system with the same name already exists, a ResourceExistsError will be raised. This method returns a client with which to interact with the newly created file system. :param str file_system: The name of the file system to create. :param metadata: A dict with name-value pairs to associate with the file system as metadata. Example: `{'Category':'test'}` :type metadata: dict(str, str) :param public_access: Possible values include: file system, file. :type public_access: ~azure.storage.filedatalake.PublicAccess :keyword int timeout: The timeout parameter is expressed in seconds. :rtype: ~azure.storage.filedatalake.FileSystemClient .. admonition:: Example: .. literalinclude:: ../samples/datalake_samples_service.py :start-after: [START create_file_system_from_service_client] :end-before: [END create_file_system_from_service_client] :language: python :dedent: 8 :caption: Creating a file system in the datalake service. """ file_system_client = self.get_file_system_client(file_system) file_system_client.create_file_system(metadata=metadata, public_access=public_access, **kwargs) return file_system_client def _rename_file_system(self, name, new_name, **kwargs): # type: (str, str, **Any) -> FileSystemClient """Renames a filesystem. Operation is successful only if the source filesystem exists. :param str name: The name of the filesystem to rename. 
        :param str new_name: The new filesystem name the user wants to rename to.
        :keyword lease:
            Specify this to perform only if the lease ID given matches the
            active lease ID of the source filesystem.
        :paramtype lease: ~azure.storage.filedatalake.DataLakeLeaseClient or str
        :keyword int timeout: The timeout parameter is expressed in seconds.
        :rtype: ~azure.storage.filedatalake.FileSystemClient
        """
        # A Data Lake filesystem is backed by a blob container, so the rename
        # is delegated to the wrapped blob service client.
        self._blob_service_client._rename_container(name, new_name, **kwargs)   # pylint: disable=protected-access
        renamed_file_system = self.get_file_system_client(new_name)
        return renamed_file_system

    def undelete_file_system(self, name, deleted_version, **kwargs):
        # type: (str, str, **Any) -> FileSystemClient
        """Restores soft-deleted filesystem.

        Operation will only be successful if used within the specified
        number of days set in the delete retention policy.

        .. versionadded:: 12.3.0
            This operation was introduced in API version '2019-12-12'.

        :param str name:
            Specifies the name of the deleted filesystem to restore.
        :param str deleted_version:
            Specifies the version of the deleted filesystem to restore.
        :keyword int timeout:
            The timeout parameter is expressed in seconds.
        :rtype: ~azure.storage.filedatalake.FileSystemClient
        """
        # An optional 'new_name' keyword restores the filesystem under a
        # different name; the returned client targets the effective name.
        new_name = kwargs.pop('new_name', None)
        file_system = self.get_file_system_client(new_name or name)
        self._blob_service_client.undelete_container(
            name, deleted_version, new_name=new_name, **kwargs)  # pylint: disable=protected-access
        return file_system

    def delete_file_system(
            self, file_system,  # type: Union[FileSystemProperties, str]
            **kwargs):
        # type: (...) -> FileSystemClient
        """Marks the specified file system for deletion.

        The file system and any files contained within it are later
        deleted during garbage collection. If the file system is not found,
        a ResourceNotFoundError will be raised.

        :param file_system:
            The file system to delete. This can either be the name of the file system,
            or an instance of FileSystemProperties.
        :type file_system: str or ~azure.storage.filedatalake.FileSystemProperties
        :keyword lease:
            If specified, delete_file_system only succeeds if the
            file system's lease is active and matches this ID.
            Required if the file system has an active lease.
        :paramtype lease: ~azure.storage.filedatalake.DataLakeLeaseClient or str
        :keyword ~datetime.datetime if_modified_since:
            A DateTime value. Azure expects the date value passed in to be UTC.
            If timezone is included, any non-UTC datetimes will be converted to UTC.
            If a date is passed in without timezone info, it is assumed to be UTC.
            Specify this header to perform the operation only
            if the resource has been modified since the specified time.
        :keyword ~datetime.datetime if_unmodified_since:
            A DateTime value. Azure expects the date value passed in to be UTC.
            If timezone is included, any non-UTC datetimes will be converted to UTC.
            If a date is passed in without timezone info, it is assumed to be UTC.
            Specify this header to perform the operation only if
            the resource has not been modified since the specified date/time.
        :keyword str etag:
            An ETag value, or the wildcard character (*). Used to check if the resource has changed,
            and act according to the condition specified by the `match_condition` parameter.
        :keyword ~azure.core.MatchConditions match_condition:
            The match condition to use upon the etag.
        :keyword int timeout:
            The timeout parameter is expressed in seconds.
        :rtype: None

        .. admonition:: Example:

            .. literalinclude:: ../samples/datalake_samples_service.py
                :start-after: [START delete_file_system_from_service_client]
                :end-before: [END delete_file_system_from_service_client]
                :language: python
                :dedent: 8
                :caption: Deleting a file system in the datalake service.
        """
        file_system_client = self.get_file_system_client(file_system)
        file_system_client.delete_file_system(**kwargs)
        return file_system_client

    def get_file_system_client(
            self, file_system  # type: Union[FileSystemProperties, str]
    ):
        # type: (...) -> FileSystemClient
        """Get a client to interact with the specified file system.

        The file system need not already exist.

        :param file_system:
            The file system. This can either be the name of the file system,
            or an instance of FileSystemProperties.
        :type file_system: str or ~azure.storage.filedatalake.FileSystemProperties
        :returns: A FileSystemClient.
        :rtype: ~azure.storage.filedatalake.FileSystemClient

        .. admonition:: Example:

            .. literalinclude:: ../samples/datalake_samples_file_system.py
                :start-after: [START create_file_system_client_from_service]
                :end-before: [END create_file_system_client_from_service]
                :language: python
                :dedent: 8
                :caption: Getting the file system client to interact with a specific file system.
        """
        # Accept either a FileSystemProperties (has .name) or a plain string.
        try:
            file_system_name = file_system.name
        except AttributeError:
            file_system_name = file_system
        # Reuse this client's transport and policies so the child client
        # shares the same pipeline configuration and connection pooling.
        _pipeline = Pipeline(
            transport=TransportWrapper(self._pipeline._transport),  # pylint: disable = protected-access
            policies=self._pipeline._impl_policies  # pylint: disable = protected-access
        )
        return FileSystemClient(
            self.url, file_system_name,
            credential=self._raw_credential,
            _configuration=self._config,
            _pipeline=_pipeline, _hosts=self._hosts,
            require_encryption=self.require_encryption,
            key_encryption_key=self.key_encryption_key,
            key_resolver_function=self.key_resolver_function)

    def get_directory_client(
            self, file_system,  # type: Union[FileSystemProperties, str]
            directory  # type: Union[DirectoryProperties, str]
    ):
        # type: (...) -> DataLakeDirectoryClient
        """Get a client to interact with the specified directory.

        The directory need not already exist.

        :param file_system:
            The file system that the directory is in. This can either be the name of the file system,
            or an instance of FileSystemProperties.
        :type file_system: str or ~azure.storage.filedatalake.FileSystemProperties
        :param directory:
            The directory with which to interact. This can either be the name of the directory,
            or an instance of DirectoryProperties.
        :type directory: str or ~azure.storage.filedatalake.DirectoryProperties
        :returns: A DataLakeDirectoryClient.
        :rtype: ~azure.storage.filedatalake.DataLakeDirectoryClient

        .. admonition:: Example:

            .. literalinclude:: ../samples/datalake_samples_service.py
                :start-after: [START get_directory_client_from_service_client]
                :end-before: [END get_directory_client_from_service_client]
                :language: python
                :dedent: 8
                :caption: Getting the directory client to interact with a specific directory.
        """
        try:
            file_system_name = file_system.name
        except AttributeError:
            file_system_name = file_system
        try:
            directory_name = directory.name
        except AttributeError:
            directory_name = directory
        _pipeline = Pipeline(
            transport=TransportWrapper(self._pipeline._transport),  # pylint: disable = protected-access
            policies=self._pipeline._impl_policies  # pylint: disable = protected-access
        )
        return DataLakeDirectoryClient(
            self.url, file_system_name, directory_name=directory_name,
            credential=self._raw_credential,
            _configuration=self._config,
            _pipeline=_pipeline, _hosts=self._hosts,
            require_encryption=self.require_encryption,
            key_encryption_key=self.key_encryption_key,
            key_resolver_function=self.key_resolver_function)

    def get_file_client(
            self, file_system,  # type: Union[FileSystemProperties, str]
            file_path  # type: Union[FileProperties, str]
    ):
        # type: (...) -> DataLakeFileClient
        """Get a client to interact with the specified file.

        The file need not already exist.

        :param file_system:
            The file system that the file is in. This can either be the name of the file system,
            or an instance of FileSystemProperties.
        :type file_system: str or ~azure.storage.filedatalake.FileSystemProperties
        :param file_path:
            The file with which to interact. This can either be the full path of the file(from the root directory),
            or an instance of FileProperties. eg. directory/subdirectory/file
        :type file_path: str or ~azure.storage.filedatalake.FileProperties
        :returns: A DataLakeFileClient.
        :rtype: ~azure.storage.filedatalake.DataLakeFileClient

        .. admonition:: Example:

            .. literalinclude:: ../samples/datalake_samples_service.py
                :start-after: [START get_file_client_from_service_client]
                :end-before: [END get_file_client_from_service_client]
                :language: python
                :dedent: 8
                :caption: Getting the file client to interact with a specific file.
        """
        try:
            file_system_name = file_system.name
        except AttributeError:
            file_system_name = file_system
        # FileProperties carries the full path in .name; plain strings pass
        # through unchanged.
        try:
            file_path = file_path.name
        except AttributeError:
            pass
        _pipeline = Pipeline(
            transport=TransportWrapper(self._pipeline._transport),  # pylint: disable = protected-access
            policies=self._pipeline._impl_policies  # pylint: disable = protected-access
        )
        return DataLakeFileClient(
            self.url, file_system_name, file_path=file_path,
            credential=self._raw_credential,
            _hosts=self._hosts, _configuration=self._config,
            _pipeline=_pipeline,
            require_encryption=self.require_encryption,
            key_encryption_key=self.key_encryption_key,
            key_resolver_function=self.key_resolver_function)

    def set_service_properties(self, **kwargs):
        # type: (**Any) -> None
        """Sets the properties of a storage account's Datalake service, including
        Azure Storage Analytics.

        .. versionadded:: 12.4.0
            This operation was introduced in API version '2020-06-12'.

        If an element (e.g. analytics_logging) is left as None, the
        existing settings on the service for that functionality are preserved.

        :keyword analytics_logging:
            Groups the Azure Analytics Logging settings.
        :type analytics_logging: ~azure.storage.filedatalake.AnalyticsLogging
        :keyword hour_metrics:
            The hour metrics settings provide a summary of request
            statistics grouped by API in hourly aggregates.
        :type hour_metrics: ~azure.storage.filedatalake.Metrics
        :keyword minute_metrics:
            The minute metrics settings provide request statistics
            for each minute.
        :type minute_metrics: ~azure.storage.filedatalake.Metrics
        :keyword cors:
            You can include up to five CorsRule elements in the
            list. If an empty list is specified, all CORS rules will be deleted,
            and CORS will be disabled for the service.
        :type cors: list[~azure.storage.filedatalake.CorsRule]
        :keyword str target_version:
            Indicates the default version to use for requests if an incoming
            request's version is not specified.
        :keyword delete_retention_policy:
            The delete retention policy specifies whether to retain deleted files/directories.
            It also specifies the number of days and versions of file/directory to keep.
        :type delete_retention_policy: ~azure.storage.filedatalake.RetentionPolicy
        :keyword static_website:
            Specifies whether the static website feature is enabled,
            and if yes, indicates the index document and 404 error document to use.
        :type static_website: ~azure.storage.filedatalake.StaticWebsite
        :keyword int timeout:
            The timeout parameter is expressed in seconds.
        :rtype: None
        """
        return self._blob_service_client.set_service_properties(**kwargs)  # pylint: disable=protected-access

    def get_service_properties(self, **kwargs):
        # type: (**Any) -> Dict[str, Any]
        """Gets the properties of a storage account's datalake service, including
        Azure Storage Analytics.

        .. versionadded:: 12.4.0
            This operation was introduced in API version '2020-06-12'.

        :keyword int timeout:
            The timeout parameter is expressed in seconds.
        :returns: An object containing datalake service properties such as
            analytics logging, hour/minute metrics, cors rules, etc.
        :rtype: Dict[str, Any]
        """
        # Blob-level properties are converted to their Data Lake equivalents
        # before being returned.
        props = self._blob_service_client.get_service_properties(**kwargs)  # pylint: disable=protected-access
        return get_datalake_service_properties(props)
class AzureBlobFileSystem(AbstractFileSystem):
    """
    Access Azure Datalake Gen2 and Azure Storage if it were a file system using Multiprotocol Access

    Parameters
    ----------
    account_name: str
        The storage account name. This is used to authenticate requests
        signed with an account key and to construct the storage endpoint. It
        is required unless a connection string is given, or if a custom
        domain is used with anonymous authentication.
    account_key: str
        The storage account key. This is used for shared key authentication.
        If any of account key, sas token or client_id is specified, anonymous access
        will be used.
    sas_token: str
        A shared access signature token to use to authenticate requests
        instead of the account key. If account key and sas token are both
        specified, account key will be used to sign. If any of account key, sas token
        or client_id are specified, anonymous access will be used.
    request_session: Session
        The session object to use for http requests.
    connection_string: str
        If specified, this will override all other parameters besides
        request session. See
        http://azure.microsoft.com/en-us/documentation/articles/storage-configure-connection-string/
        for the connection string format.
    socket_timeout: int
        If specified, this will override the default socket timeout. The timeout specified is in
        seconds. See DEFAULT_SOCKET_TIMEOUT in _constants.py for the default value.
    token_credential: TokenCredential
        A token credential used to authenticate HTTPS requests. The token value
        should be updated before its expiration.
    blocksize: int
        The block size to use for download/upload operations. Defaults to the value of
        ``BlockBlobService.MAX_BLOCK_SIZE``
    client_id: str
        Client ID to use when authenticating using an AD Service Principal client/secret.
    client_secret: str
        Client secret to use when authenticating using an AD Service Principal client/secret.
    tenant_id: str
        Tenant ID to use when authenticating using an AD Service Principal client/secret.

    Examples
    --------
    >>> abfs = AzureBlobFileSystem(account_name="XXXX", account_key="XXXX", container_name="XXXX")
    >>> abfs.ls('')

    **  Sharded Parquet & csv files can be read as: **
        ------------------------------------------
        ddf = dd.read_csv('abfs://container_name/folder/*.csv', storage_options={
        ...    'account_name': ACCOUNT_NAME, 'account_key': ACCOUNT_KEY})

        ddf = dd.read_parquet('abfs://container_name/folder.parquet', storage_options={
        ...    'account_name': ACCOUNT_NAME, 'account_key': ACCOUNT_KEY,})
    """

    protocol = "abfs"

    def __init__(
        self,
        account_name: str,
        account_key: str = None,
        connection_string: str = None,
        credential: str = None,
        sas_token: str = None,
        request_session=None,
        socket_timeout: int = None,
        token_credential=None,
        blocksize: int = create_configuration(storage_sdk="blob").max_block_size,
        client_id: str = None,
        client_secret: str = None,
        tenant_id: str = None,
    ):
        AbstractFileSystem.__init__(self)
        self.account_name = account_name
        self.account_key = account_key
        self.connection_string = connection_string
        self.credential = credential
        self.sas_token = sas_token
        self.request_session = request_session
        self.socket_timeout = socket_timeout
        self.token_credential = token_credential
        self.blocksize = blocksize
        self.client_id = client_id
        self.client_secret = client_secret
        self.tenant_id = tenant_id
        # Fall back to AD service-principal auth only when no other credential
        # was supplied but a client_id was.
        if (
            self.token_credential is None
            and self.account_key is None
            and self.sas_token is None
            and self.client_id is not None
        ):
            self.token_credential = self._get_token_from_service_principal()
        self.do_connect()

    @classmethod
    def _strip_protocol(cls, path: str):
        """
        Remove the protocol from the input path

        Parameters
        ----------
        path: str
            Path to remove the protocol from

        Returns
        -------
        str
            Returns a path without the protocol
        """
        logging.debug(f"_strip_protocol for {path}")
        ops = infer_storage_options(path)

        # we need to make sure that the path retains
        # the format {host}/{path}
        # here host is the container_name
        if ops.get("host", None):
            ops["path"] = ops["host"] + ops["path"]
        ops["path"] = ops["path"].lstrip("/")

        logging.debug(f"_strip_protocol({path}) = {ops}")
        return ops["path"]

    def _get_token_from_service_principal(self):
        """
        Create a TokenCredential given a client_id, client_secret and tenant_id

        Returns
        -------
        TokenCredential
        """
        # Imported lazily so these legacy azure packages are only required
        # when service-principal auth is actually used.
        from azure.common.credentials import ServicePrincipalCredentials
        from azure.storage.common import TokenCredential

        sp_cred = ServicePrincipalCredentials(
            client_id=self.client_id,
            secret=self.client_secret,
            tenant=self.tenant_id,
            resource="https://storage.azure.com/",
        )

        token_cred = TokenCredential(sp_cred.token["access_token"])
        return token_cred

    def do_connect(self):
        """Connect to the BlobServiceClient, using user-specified connection details.
        Tries credentials first, then connection string and finally account key

        Raises
        ------
        ValueError if none of the connection details are available
        """
        self.account_url: str = f"https://{self.account_name}.blob.core.windows.net"
        # Precedence: explicit credential > connection string > account key.
        if self.credential is not None:
            self.service_client = BlobServiceClient(
                account_url=self.account_url, credential=self.credential
            )
        elif self.connection_string is not None:
            self.service_client = BlobServiceClient.from_connection_string(
                conn_str=self.connection_string
            )
        elif self.account_key is not None:
            self.service_client = BlobServiceClient(
                account_url=self.account_url, credential=self.account_key
            )
        else:
            raise ValueError("unable to connect with provided params!!")

    def split_path(self, path, delimiter="/", return_container: bool = False, **kwargs):
        """
        Normalize ABFS path string into bucket and key.

        Parameters
        ----------
        path : string
            Input path, like `abfs://my_container/path/to/file`
        delimiter: string
            Delimiter used to split the path
        return_container: bool

        Examples
        --------
        >>> split_path("abfs://my_container/path/to/file")
        ['my_container', 'path/to/file']
        """
        if path in ["", delimiter]:
            return "", ""

        path = self._strip_protocol(path)
        path = path.lstrip(delimiter)
        if "/" not in path:
            # this means path is the container_name
            return path, ""
        else:
            return path.split(delimiter, 1)

    # def _generate_blobs(self, *args, **kwargs):
    #     """Follow next_marker to get ALL results."""
    #     logging.debug("running _generate_blobs...")
    #     blobs = self.blob_fs.list_blobs(*args, **kwargs)
    #     yield from blobs
    #     while blobs.next_marker:
    #         logging.debug(f"following next_marker {blobs.next_marker}")
    #         kwargs["marker"] = blobs.next_marker
    #         blobs = self.blob_fs.list_blobs(*args, **kwargs)
    #         yield from blobs

    # def _matches(
    #     self, container_name, path, as_directory=False, delimiter="/", **kwargs
    # ):
    #     """check if the path returns an exact match"""

    #     path = path.rstrip(delimiter)
    #     gen = self.blob_fs.list_blob_names(
    #         container_name=container_name,
    #         prefix=path,
    #         delimiter=delimiter,
    #         num_results=None,
    #     )

    #     contents = list(gen)
    #     if not contents:
    #         return False

    #     if as_directory:
    #         return contents[0] == path + delimiter
    #     else:
    #         return contents[0] == path

    def ls(
        self,
        path: str,
        detail: bool = False,
        invalidate_cache: bool = True,
        delimiter: str = "/",
        return_glob: bool = False,
        **kwargs,
    ):
        """
        Create a list of blob names from a blob container

        Parameters
        ----------
        path: str
            Path to an Azure Blob with its container name
        detail: bool
            If False, return a list of blob names, else a list of dictionaries with blob details
        invalidate_cache: bool
            If True, do not use the cache
        delimiter: str
            Delimiter used to split paths
        return_glob: bool
        """
        logging.debug(f"abfs.ls() is searching for {path}")

        container, path = self.split_path(path)
        if (container in ["", delimiter]) and (path in ["", delimiter]):
            # This is the case where only the containers are being returned
            logging.info(
                "Returning a list of containers in the azure blob storage account"
            )
            if detail:
                contents = self.service_client.list_containers(include_metadata=True)
                return self._details(contents)
            else:
                contents = self.service_client.list_containers()
                return [f"{c.name}{delimiter}" for c in contents]
        else:
            if container not in ["", delimiter]:
                # This is the case where the container name is passed
                container_client = self.service_client.get_container_client(
                    container=container
                )
                blobs = container_client.walk_blobs(name_starts_with=path)
                # Materialize the pager so we can branch on length/type below.
                try:
                    blobs = [blob for blob in blobs]
                except Exception:
                    raise FileNotFoundError
                if len(blobs) > 1:
                    if return_glob:
                        return self._details(blobs, return_glob=True)
                    if detail:
                        return self._details(blobs)
                    else:
                        return [
                            f"{blob.container}{delimiter}{blob.name}" for blob in blobs
                        ]
                elif len(blobs) == 1:
                    # Exact-name match without a blob_type key: treat it as a
                    # directory prefix and re-list its contents.
                    if (blobs[0].name.rstrip(delimiter) == path) and not blobs[
                        0
                    ].has_key(  # NOQA
                        "blob_type"
                    ):
                        path = blobs[0].name
                        blobs = container_client.walk_blobs(name_starts_with=path)
                        if return_glob:
                            return self._details(blobs, return_glob=True)
                        if detail:
                            return self._details(blobs)
                        else:
                            return [
                                f"{blob.container}{delimiter}{blob.name}"
                                for blob in blobs
                            ]
                    elif isinstance(blobs[0], BlobPrefix):
                        if detail:
                            for blob_page in blobs:
                                return self._details(blob_page)
                        else:
                            outblobs = []
                            for blob_page in blobs:
                                for blob in blob_page:
                                    outblobs.append(
                                        f"{blob.container}{delimiter}{blob.name}"
                                    )
                            return outblobs
                    elif blobs[0]["blob_type"] == "BlockBlob":
                        if detail:
                            return self._details(blobs)
                        else:
                            return [
                                f"{blob.container}{delimiter}{blob.name}"
                                for blob in blobs
                            ]
                    elif isinstance(blobs[0], ItemPaged):
                        # NOTE(review): this branch builds outblobs but never
                        # returns it, so the method falls through and returns
                        # None here — confirm whether `return outblobs` is
                        # missing.
                        outblobs = []
                        for page in blobs:
                            for b in page:
                                outblobs.append(b)
                    else:
                        raise FileNotFoundError(
                            f"Unable to identify blobs in {path} for {blobs[0].name}"
                        )
                elif len(blobs) == 0:
                    if return_glob or (path in ["", delimiter]):
                        return []
                    else:
                        raise FileNotFoundError
            else:
                raise FileNotFoundError

    def _details(self, contents, delimiter="/", return_glob: bool = False, **kwargs):
        """
        Return a list of dictionaries of specifying details about the contents

        Parameters
        ----------
        contents
        delimiter: str
            Delimiter used to separate containers and files
        return_glob: bool

        Returns
        -------
        List of dicts
            Returns details about the contents, such as name, size and type
        """
        pathlist = []
        for c in contents:
            data = {}
            if c.has_key("container"):  # NOQA
                data["name"] = f"{c.container}{delimiter}{c.name}"
                if c.has_key("size"):  # NOQA
                    data["size"] = c.size
                else:
                    data["size"] = 0
                # Zero-size entries are treated as directory placeholders.
                if data["size"] == 0:
                    data["type"] = "directory"
                else:
                    data["type"] = "file"
            else:
                # Entries without a container key (e.g. container listings)
                # are reported as directories.
                data["name"] = f"{c.name}{delimiter}"
                data["size"] = 0
                data["type"] = "directory"
            if return_glob:
                data["name"] = data["name"].rstrip("/")
            pathlist.append(data)
        return pathlist

    def walk(self, path: str, maxdepth=None, **kwargs):
        """Return all files belows path

        List all files, recursing into subdirectories; output is iterator-style,
        like ``os.walk()``. For a simple list of files, ``find()`` is available.

        Note that the "files" outputted will include anything that is not
        a directory, such as links.

        Parameters
        ----------
        path: str
            Root to recurse into
        maxdepth: int
            Maximum recursion depth. None means limitless, but not recommended
            on link-based file-systems.
        **kwargs are passed to ``ls``
        """
        path = self._strip_protocol(path)
        full_dirs = {}
        dirs = {}
        files = {}

        detail = kwargs.pop("detail", False)
        try:
            listing = self.ls(path, detail=True, return_glob=True, **kwargs)
        except (FileNotFoundError, IOError):
            return [], [], []

        for info in listing:
            # each info name must be at least [path]/part , but here
            # we check also for names like [path]/part/
            pathname = info["name"].rstrip("/")
            name = pathname.rsplit("/", 1)[-1]
            if info["type"] == "directory" and pathname != path:
                # do not include "self" path
                full_dirs[pathname] = info
                dirs[name] = info
            elif pathname == path:
                # file-like with same name as give path
                files[""] = info
            else:
                files[name] = info

        if detail:
            yield path, dirs, files
        else:
            yield path, list(dirs), list(files)

        if maxdepth is not None:
            maxdepth -= 1
            if maxdepth < 1:
                return

        for d in full_dirs:
            yield from self.walk(d, maxdepth=maxdepth, detail=detail, **kwargs)

    def mkdir(self, path, delimiter="/", exists_ok=False, **kwargs):
        """
        Create directory entry at path

        Parameters
        ----------
        path: str
            The path to create
        delimiter: str
            Delimiter to use when splitting the path
        exists_ok: bool
            If False (default), attempting to create a target that cannot be
            created raises RuntimeError; if True, creation is only attempted
            for a path inside an existing container and other cases are
            silently ignored.
        """
        container_name, path = self.split_path(path, delimiter=delimiter)
        if not exists_ok:
            # NOTE(review): ls("") returns names with a trailing delimiter
            # (see ls), so `container_name not in self.ls("")` may always be
            # True — confirm the intended membership check.
            if (container_name not in self.ls("")) and (not path):
                # create new container
                self.service_client.create_container(name=container_name)
            elif (
                container_name
                in [container_path.split("/")[0] for container_path in self.ls("")]
            ) and path:
                ## attempt to create prefix
                container_client = self.service_client.get_container_client(
                    container=container_name
                )
                container_client.upload_blob(name=path, data="")
            else:
                ## everything else
                raise RuntimeError(f"Cannot create {container_name}{delimiter}{path}.")
        else:
            if container_name in self.ls("") and path:
                container_client = self.service_client.get_container_client(
                    container=container_name
                )
                container_client.upload_blob(name=path, data="")

    def rmdir(self, path: str, delimiter="/", **kwargs):
        """
        Remove a directory, if empty

        Parameters
        ----------
        path: str
            Path of directory to remove
        delimiter: str
            Delimiter to use when splitting the path
        """
        container_name, path = self.split_path(path, delimiter=delimiter)
        # Only a bare container (no sub-path) is deleted here.
        if (container_name + delimiter in self.ls("")) and (not path):
            # delete container
            self.service_client.delete_container(container_name)

    def _rm(self, path, delimiter="/", **kwargs):
        """
        Delete a given file

        Parameters
        ----------
        path: str
            Path to file to delete
        delimiter: str
            Delimiter to use when splitting the path
        """
        if self.isfile(path):
            container_name, path = self.split_path(path, delimiter=delimiter)
            container_client = self.service_client.get_container_client(
                container=container_name
            )
            logging.debug(f"Delete blob {path} in {container_name}")
            container_client.delete_blob(path)
        elif self.isdir(path):
            container_name, path = self.split_path(path, delimiter=delimiter)
            container_client = self.service_client.get_container_client(
                container=container_name
            )
            if (container_name + delimiter in self.ls("")) and (not path):
                logging.debug(f"Delete container {container_name}")
                container_client.delete_container(container_name)
        else:
            raise RuntimeError(f"cannot delete {path}")

    def _open(
        self,
        path: str,
        mode: str = "rb",
        block_size: int = None,
        autocommit: bool = True,
        cache_options=None,
        **kwargs,
    ):
        """Open a file on the datalake, or a block blob

        Parameters
        ----------
        path: str
            Path to file to open
        mode: str
            What mode to open the file in - defaults to "rb"
        block_size: int
            Size per block for multi-part downloads.
        autocommit: bool
            Whether or not to write to the destination directly
        cache_type: str
            One of "readahead", "none", "mmap", "bytes", defaults to "readahead"
            Caching policy in read mode. See the definitions here:
            https://filesystem-spec.readthedocs.io/en/latest/api.html#readbuffering
        """
        logging.debug(f"_open:  {path}")
        return AzureBlobFile(
            fs=self,
            path=path,
            mode=mode,
            block_size=block_size or self.blocksize,
            autocommit=autocommit,
            cache_options=cache_options,
            **kwargs,
        )
account_name = 'lilablobssc' storage_account_url_blob = 'https://' + account_name + '.blob.core.windows.net' # Read-only storage_account_sas_token = '' storage_account_key = '' output_file = r'd:\temp\lila_sas_urls.txt' #%% Enumerate containers blob_service_client = BlobServiceClient(account_url=storage_account_url_blob, credential=storage_account_sas_token) container_iter = blob_service_client.list_containers(include_metadata=False) containers = [] for container in container_iter: containers.append(container) containers = [c['name'] for c in containers] #%% Generate SAS tokens permissions = ContainerSasPermissions(read=True, write=False, delete=False, list=True) expiry_time = datetime(year=2034, month=1, day=1) start_time = datetime(year=2020, month=1, day=1)
def main(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP-triggered sweep over all source containers: for each base blob,
    request a snapshot/backup when it is new or changed, and enqueue an
    incremental-backup message when the latest version is missing from the
    backup account.

    :param req: The triggering HTTP request (its content is not used).
    :returns: An HttpResponse whose body is a status dict rendered as str.
    """
    logging.info('Python HTTP trigger function processed a request.')

    # DefaultAzureCredential supports managed identity or environment configuration (see docs).
    # One credential instance is shared by both service clients (the original
    # constructed a second, redundant DefaultAzureCredential).
    credential = DefaultAzureCredential()

    # parse parameters
    storage_account_source = os.environ["par_storage_account_name_source"]
    storage_account_source_url = "https://" + storage_account_source + ".blob.core.windows.net"
    storage_account_backup = os.environ["par_storage_account_name_backup"]
    storage_account_backup_url = "https://" + storage_account_backup + ".blob.core.windows.net"

    # create blob client for backup and source
    client_source = BlobServiceClient(account_url=storage_account_source_url, credential=credential)
    client_backup = BlobServiceClient(account_url=storage_account_backup_url, credential=credential)

    # Create queue clients
    queue_service = QueueService(
        account_name=os.environ['par_storage_account_name_queue'],
        account_key=os.environ['par_storage_account_key_queue'])
    queue_service.encode_function = QueueMessageFormat.text_base64encode

    # Get all blobs in sourcecontainer
    container_source_list = client_source.list_containers()
    for container in container_source_list:
        # Log container name
        logging.info(container.name)
        container_source = client_source.get_container_client(container.name)

        # Get all blobs in container. Listing includes snapshots and is
        # ordered so that a blob's snapshots precede the base blob; the
        # prev_* trackers exploit that ordering.
        prev_blob_name = ""
        prev_blob_etag = ""
        blob_source_list = container_source.list_blobs(include=['snapshots'])
        for blob in blob_source_list:
            if blob.snapshot is None:  # was `== None`; identity check is correct for None
                # Blob that is not snapshot.
                # 1. Check if snapshot needs to be created
                if prev_blob_name != blob.name:
                    # New blob without snapshot, create snapshot/backup
                    logging.info("new blob" + blob.name + ", create snapshot/backup")
                    create_snapshot(client_source, queue_service, container.name,
                                    blob.name, blob.etag)
                elif prev_blob_etag != blob.etag:
                    # Existing blob that has changed, create snapshot/backup
                    logging.info(blob.name + "has changed, create snapshot/backup")
                    create_snapshot(client_source, queue_service, container.name,
                                    blob.name, blob.etag)

                # 2. Check if incremental backup needs to be created
                # get blob backup and source properties — fetched once (the
                # original issued two get_blob_properties() calls per blob)
                blob_source = client_source.get_blob_client(
                    container=container.name, blob=blob.name)
                source_properties = blob_source.get_blob_properties()
                source_last_modified = source_properties['last_modified']
                source_etag = str(source_properties['etag']).replace("\"", "")
                blob_name_backup = append_timestamp_etag(
                    blob.name, source_last_modified, source_etag)
                blob_backup = client_backup.get_blob_client(
                    container=container.name + "bak", blob=blob_name_backup)
                blob_exists = check_blob_exists(blob_backup)

                # Check if blob exists
                if not blob_exists:  # was `== False`
                    # Latest blob does not yet exist in backup, create message on queue to update
                    queue_json = "{" + "\"container\":\"{}\", \"blob_name\":\"{}\", \"etag\":\"{}\"".format(
                        container.name, blob.name, source_etag) + "}"
                    logging.info("backup needed for: " + queue_json)
                    queue_service.put_message(os.environ['par_queue_name'], queue_json)
                    #asyncio.run(copy_adf_blob_source_backup(blob_source, blob_backup))

            prev_blob_name = blob.name
            prev_blob_etag = blob.etag

    result = {"status": "ok"}
    return func.HttpResponse(str(result))
class AzureStorageHelper(object):
    """Thin wrapper around azure.storage.blob.BlobServiceClient used by the
    remote-provider layer: upload/download/delete/stat blobs and containers.
    """

    def __init__(self, *args, **kwargs):
        # 'stay_on_remote' is a provider-layer flag that BlobServiceClient
        # does not understand, so it is stripped before forwarding kwargs.
        if "stay_on_remote" in kwargs:
            del kwargs["stay_on_remote"]

        # if not handed down explicitely, try to read credentials from
        # environment variables.
        for (csavar, envvar) in [
            ("account_url", "AZ_BLOB_ACCOUNT_URL"),
            ("credential", "AZ_BLOB_CREDENTIAL"),
        ]:
            if csavar not in kwargs and envvar in os.environ:
                kwargs[csavar] = os.environ.get(envvar)
        assert (
            "account_url" in kwargs
        ), "Missing AZ_BLOB_ACCOUNT_URL env var (and possibly AZ_BLOB_CREDENTIAL)"
        # remove leading '?' from SAS if needed
        # if kwargs.get("sas_token", "").startswith("?"):
        #    kwargs["sas_token"] = kwargs["sas_token"][1:]

        # by right only account_key or sas_token should be set, but we let
        # BlobServiceClient deal with the ambiguity
        self.blob_service_client = BlobServiceClient(**kwargs)

    def container_exists(self, container_name):
        """Return True if at least one container matches container_name.

        NOTE(review): the positional argument of list_containers is a name
        *prefix* filter, so this also returns True for containers whose name
        merely starts with container_name — confirm this is intended.
        """
        return any(
            True for _ in self.blob_service_client.list_containers(container_name)
        )

    def upload_to_azure_storage(
        self,
        container_name,
        file_path,
        blob_name=None,
        use_relative_path_for_blob_name=True,
        relative_start_dir=None,
        extra_args=None,
    ):
        """ Upload a file to Azure Storage
            This function uploads a file to an Azure Storage Container as a blob.
            Args:
                container_name: the name of the Azure container to use
                file_path: The path to the file to upload.
                blob_name: The name to set for the blob on Azure. If not specified, this will default to the
                    name of the file.
            Returns: The blob_name of the file on Azure if written, None otherwise
        """
        file_path = os.path.realpath(os.path.expanduser(file_path))
        assert container_name, "container_name must be specified"
        assert os.path.exists(file_path), (
            "The file path specified does not exist: %s" % file_path
        )
        assert os.path.isfile(file_path), (
            "The file path specified does not appear to be a file: %s" % file_path
        )

        container_client = self.blob_service_client.get_container_client(container_name)
        # Create the container on demand; an already-existing container is fine.
        try:
            container_client.create_container()
        except azure.core.exceptions.ResourceExistsError:
            pass

        # Derive the blob name from the file path when not given explicitly.
        if not blob_name:
            if use_relative_path_for_blob_name:
                if relative_start_dir:
                    path_blob_name = os.path.relpath(file_path, relative_start_dir)
                else:
                    path_blob_name = os.path.relpath(file_path)
            else:
                path_blob_name = os.path.basename(file_path)
            blob_name = path_blob_name
        blob_client = container_client.get_blob_client(blob_name)

        # upload_blob fails, if blob exists
        if self.exists_in_container(container_name, blob_name):
            blob_client.delete_blob()

        try:
            with open(file_path, "rb") as data:
                blob_client.upload_blob(data, blob_type="BlockBlob")
            return blob_client.get_blob_properties().name
        except Exception as e:
            raise WorkflowError("Error in creating blob. %s" % str(e))
            # return None

    def download_from_azure_storage(
        self,
        container_name,
        blob_name,
        destination_path=None,
        expandBlobNameIntoDirs=True,
        make_dest_dirs=True,
        create_stub_only=False,
    ):
        """ Download a file from Azure Storage
            This function downloads an object from a specified Azure Storage container.
            Args:
                container_name: the name of the Azure Storage container to use (container name only)
                destination_path: If specified, the file will be saved to this path, otherwise cwd.
                expandBlobNameIntoDirs: Since Azure blob names can include slashes, if this is True (defult)
                    then Azure blob names with slashes are expanded into directories on the receiving end.
                    If it is False, the blob name is passed to os.path.basename() to get the substring
                    following the last slash.
                make_dest_dirs: If this is True (default) and the destination path includes directories
                    that do not exist, they will be created.
            Returns:
                The destination path of the downloaded file on the receiving end, or None if the destination_path
                could not be downloaded
        """
        assert container_name, "container_name must be specified"
        assert blob_name, "blob_name must be specified"
        if destination_path:
            destination_path = os.path.realpath(os.path.expanduser(destination_path))
        else:
            if expandBlobNameIntoDirs:
                destination_path = os.path.join(os.getcwd(), blob_name)
            else:
                destination_path = os.path.join(
                    os.getcwd(), os.path.basename(blob_name)
                )
        # if the destination path does not exist
        if make_dest_dirs:
            os.makedirs(os.path.dirname(destination_path), exist_ok=True)
        b = self.blob_service_client.get_blob_client(container_name, blob_name)
        if not create_stub_only:
            with open(destination_path, "wb") as my_blob:
                blob_data = b.download_blob()
                blob_data.readinto(my_blob)
        else:
            # just create an empty file with the right timestamps
            ts = b.get_blob_properties().last_modified.timestamp()
            with open(destination_path, "wb") as fp:
                os.utime(fp.name, (ts, ts))
        return destination_path

    def delete_from_container(self, container_name, blob_name):
        """ Delete a file from Azure Storage container

            This function deletes an object from a specified Azure Storage container.

            Args:
                container_name: the name of the Azure Storage container to use (container name only, not endpoint)
                blob_name: the name of the blob to delete from the container

            Returns:
                nothing
        """
        assert container_name, "container_name must be specified"
        assert blob_name, "blob_name must be specified"
        b = self.blob_service_client.get_blob_client(container_name, blob_name)
        b.delete_blob()

    def exists_in_container(self, container_name, blob_name):
        """ Returns whether the blob exists in the container

            Args:
                container_name: the name of the Azure Storage container (container name only, not endpoint)
                blob_name: the blob_name of the object to delete from the container

            Returns:
                True | False
        """
        assert (
            container_name
        ), 'container_name must be specified (did you try to write to "root" or forgot to set --default-remote-prefix?)'
        assert blob_name, "blob_name must be specified"
        cc = self.blob_service_client.get_container_client(container_name)
        # name_starts_with is a prefix filter; any hit counts as existence.
        return any(True for _ in cc.list_blobs(name_starts_with=blob_name))

    def blob_size(self, container_name, blob_name):
        """ Returns the size of a blob

            Args:
                container_name: the name of the Azure Storage container (container name only, not endpoint)
                blob_name: the blob_name of the object to delete from the container

            Returns:
                Size in kb
        """
        assert container_name, "container_name must be specified"
        assert blob_name, "blob_name must be specified"
        b = self.blob_service_client.get_blob_client(container_name, blob_name)
        # size is reported in bytes by the service; convert to KiB.
        return b.get_blob_properties().size // 1024

    def blob_last_modified(self, container_name, blob_name):
        """ Returns a timestamp of a blob

            Args:
                container_name: the name of the Azure Storage container (container name only, not endpoint)
                blob_name: the blob_name of the object to delete from the container

            Returns:
                timestamp
        """
        assert container_name, "container_name must be specified"
        assert blob_name, "blob_name must be specified"
        b = self.blob_service_client.get_blob_client(container_name, blob_name)
        return b.get_blob_properties().last_modified.timestamp()

    def list_blobs(self, container_name):
        """ Returns a list of blobs from the container

            Args:
                container_name: the name of the Azure Storage container (container name only, not endpoint)

            Returns:
                list of blobs
        """
        assert container_name, "container_name must be specified"
        c = self.blob_service_client.get_container_client(container_name)
        return [b.name for b in c.list_blobs()]
answer = "" while answer not in ["y", "n"]: answer = input("OK to continue [Y/N]? ").lower() return answer == "y" #%% Create the clients source_blob_service_client = BlobServiceClient( account_url=source_account_url_blob, credential=source_sas_token) target_blob_service_client = BlobServiceClient( account_url=target_account_url_blob, credential=target_sas_token) #%% List source and destination containers source_container_iter = source_blob_service_client.list_containers( include_metadata=True) target_container_iter = target_blob_service_client.list_containers( include_metadata=True) source_containers = [] target_containers = [] print('Source containers:') for container in source_container_iter: source_containers.append(container) print(container['name'], container['metadata']) print('\nTarget containers:') for container in target_container_iter: target_containers.append(container) print(container['name'], container['metadata'])
class AzureClient(CloudClient):
    """
    Implementation of a Azure Client using the Azure API
    """

    def __init__(self, account_name=None, credential=None, auth_dict=None,
                 *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Credentials from auth_dict take precedence over the explicit arguments.
        if auth_dict:
            account_name = auth_dict.get("STORAGE_ACCOUNT_NAME")
            credential = auth_dict.get("STORAGE_ACCOUNT_KEY")
        if account_name and credential:
            self.account_name = account_name
            self.credential = credential
            self.secret = self.create_azure_secret()
            endpoint = constants.AZURE_BLOB_ENDPOINT_TEMPLATE.format(
                account_name)
            self.blob_service_client = BlobServiceClient(
                account_url=endpoint, credential=credential)

    def internal_create_uls(self, name, region):
        """
        Creates the Underlying Storage using the Azure API

        Args:
            name (str): The Underlying Storage name to be created
            region (str): Unused; kept for interface compatibility
        """
        container = self.blob_service_client.get_container_client(name)
        container.create_container()

    def internal_delete_uls(self, name):
        """
        Deletes the Underlying Storage using the Azure API

        Args:
            name (str): The Underlying Storage name to be deleted
        """
        container = self.blob_service_client.get_container_client(name)
        container.delete_container()

    def get_all_uls_names(self):
        """
        Returns a set containing all the container names that the client
        has access to
        """
        containers = self.blob_service_client.list_containers()
        return set(container["name"] for container in containers)

    def verify_uls_exists(self, uls_name):
        """
        Verifies whether a Underlying Storage with the given uls_name exists

        Args:
            uls_name (str): The Underlying Storage name to be verified

        Returns:
            bool: True if Underlying Storage exists, False otherwise
        """
        try:
            container = self.blob_service_client.get_container_client(uls_name)
            container.get_container_properties()
        except ResourceNotFoundError:
            return False
        return True

    def create_azure_secret(self):
        """
        Create a Kubernetes secret to allow NooBaa to create Azure-based
        backingstores
        """
        def _b64(value):
            # NooBaa expects URL-safe base64-encoded ASCII values.
            return base64.urlsafe_b64encode(value.encode("UTF-8")).decode("ascii")

        secret_data = templating.load_yaml(
            constants.MCG_BACKINGSTORE_SECRET_YAML)
        secret_data["metadata"]["name"] = create_unique_resource_name(
            "cldmgr-azure", "secret")
        secret_data["metadata"]["namespace"] = config.ENV_DATA[
            "cluster_namespace"]
        secret_data["data"]["AccountKey"] = _b64(self.credential)
        secret_data["data"]["AccountName"] = _b64(self.account_name)
        return create_resource(**secret_data)
class AzBlobManagerSync:
    """A utility class to help working with Azure Storage.

    This class implements synchronous methods based on the Microsoft Python SDK
    azure.storage.blob
    See:
        https://docs.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob?view=azure-python

    Available:
        - Basic methods to work with containers and blobs
    """

    def __init__(self, connection_string=None, account_url=None, credential=None):
        """Instantiate a synchronous AzBlobManagerSync object.

        Args:
            connection_string (str): A connection string to an Azure Storage account.
            account_url (str): The URL to the blob storage account. Any other entities
                included in the URL path (e.g. container or blob) will be discarded.
                This URL can be optionally authenticated with a SAS token.
            credential (str): The credentials with which to authenticate.
                This is optional if the account URL already has a SAS token, or the
                connection string already has shared access key values.
                The value can be a SAS token string, an account shared access key,
                or an instance of a TokenCredentials class from azure.identity.
                Credentials provided here will take precedence over those in the
                connection string.

        Examples:
            Creating the AzBlobManagerSync with account url and a shared access key:
                azStorageManager = AzBlobManagerSync(account_url=self.url, credential=self.shared_access_key)
            Creating the AzBlobManagerSync with a connection string that has the shared access key:
                azStorageManager = AzBlobManagerSync(connection_string='DefaultEndpointsProtocol=http;...')
        """
        self.connection_string = connection_string
        self.account_url = account_url
        self.credential = credential
        # Default to None so a failed construction leaves a well-defined
        # attribute instead of the BlobServiceClient class object.
        self.blob_service_client = None
        try:
            from azure.storage.blob import BlobServiceClient
            if (self.connection_string is not None):
                # Create BlobServiceClient from a Connection String
                self.blob_service_client = BlobServiceClient.from_connection_string(
                    conn_str=self.connection_string, credential=self.credential)
            else:
                # Create the BlobServiceClient with account url and credential.
                self.blob_service_client = BlobServiceClient(
                    account_url=self.account_url, credential=self.credential)
        except AzureError as err:
            self._logAzureError(err=err)
        except Exception:
            logger.exception('')

    def _logAzureError(self, err=AzureError):
        """Log the first line of an AzureError's message at ERROR level."""
        msg = err.message.split('\n')[0]
        logger.error(f'AzureError error: {msg}')

    def create_container(self, container_name):
        """Creates a new container.

        Args:
            container_name (str): The name of the container.
                See https://docs.microsoft.com/en-us/rest/api/storageservices/naming-and-referencing-containers--blobs--and-metadata
                for naming convention.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        success = False
        try:
            new_container = self.blob_service_client.create_container(
                container_name)
            properties = new_container.get_container_properties()
            success = properties is not None and properties.name == container_name
        except ResourceExistsError:
            logger.info(f'Container \"{container_name}\" already exists.')
        except AzureError as err:
            self._logAzureError(err=err)
        except Exception:
            logger.exception('')
        return success

    def delete_container(self, container_name):
        """Deletes a container.

        Args:
            container_name (str): The name of the container.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        success = False
        try:
            self.blob_service_client.delete_container(container_name)
            success = True
        except ResourceNotFoundError:
            logger.info(f'Container \"{container_name}\" does not exist.')
        except AzureError as err:
            self._logAzureError(err=err)
        except Exception:
            logger.exception('')
        return success

    def _list_containers(self, name_starts_with=None, include_metadata=False):
        """Lists containers.

        Args:
            name_starts_with (str): Filters the results to return only containers
                whose names begin with the specified prefix.
            include_metadata (bool): Specifies that container metadata be returned
                in the response.

        Returns:
            list(ContainerProperties) or None: The containers found, or None on error.
        """
        try:
            containers = []
            for container in self.blob_service_client.list_containers(
                    name_starts_with=name_starts_with,
                    include_metadata=include_metadata):
                containers.append(container)
            return containers
        except AzureError as err:
            self._logAzureError(err=err)
        except Exception:
            logger.exception('')
        return None

    def list_containers_name(self, name_starts_with=None):
        """Lists containers' name.

        Args:
            name_starts_with (str): Filters the results to return only containers
                whose names begin with the specified prefix.

        Returns:
            list: A list of strings representing the container names.
                Empty on error.
        """
        containers_list = []
        containers = self._list_containers(
            name_starts_with=name_starts_with, include_metadata=False)
        if (containers is None):
            return containers_list
        for container in containers:
            containers_list.append(container['name'])
        return containers_list

    def create_append_blob(self, container_name, blob_name, replace_blob=False):
        """Creates an append blob in an existing container.

        Args:
            container_name (str): The name of the container.
            blob_name (str): The name of the blob.
            replace_blob (bool): If True, deletes existing blob with same name.

        Returns:
            bool: The return value. True for success, False otherwise.
                Note: False is also returned when the blob already exists and
                replace_blob is False (nothing was created).
        """
        success = False
        try:
            blob_client = self.blob_service_client.get_blob_client(
                container_name, blob_name)
            # Raises ResourceNotFoundError if the blob does not exist yet.
            blob_client.get_blob_properties()
            # Blob exists already; only recreate it when asked to.
            if (replace_blob is True):
                blob_client.create_append_blob()
                success = True
        except ResourceNotFoundError:
            blob_client.create_append_blob()
            success = True
        except AzureError as err:
            self._logAzureError(err=err)
        except Exception:
            logger.exception('')
        return success

    def create_page_blob(self, container_name, blob_name, size=1024,
                         content_settings=None, metadata=None,
                         premium_page_blob_tier=None):
        """Creates a page blob in an existing container.

        Args:
            container_name (str): The name of the container.
            blob_name (str): The name of the blob.
            size (int): This specifies the maximum size for the page blob, up to 1 TB.
                The page blob size must be aligned to a 512-byte boundary.
            content_settings (ContentSettings): ContentSettings object used to set
                blob properties. Used to set content type, encoding, language,
                disposition, md5, and cache control.
            metadata (dict(str, str)): Name-value pairs associated with the blob
                as metadata.
            premium_page_blob_tier (PremiumPageBlobTier): A page blob tier value
                to set the blob to.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        success = False
        try:
            blob_client = self.blob_service_client.get_blob_client(
                container_name, blob_name)
            blob_client.create_page_blob(
                size, content_settings, metadata, premium_page_blob_tier)
            success = True
        except AzureError as err:
            self._logAzureError(err=err)
        except Exception:
            logger.exception('')
        return success

    def delete_blob(self, container_name, blob_name):
        """Deletes a blob.

        Args:
            container_name (str): The name of the container.
            blob_name (str): The name of the blob.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        success = False
        try:
            blob_client = self.blob_service_client.get_blob_client(
                container_name, blob_name)
            blob_client.delete_blob()
            success = True
        except AzureError as err:
            self._logAzureError(err=err)
        except Exception:
            logger.exception('')
        return success

    def list_blobs(self, container_name):
        """Lists the blobs in the specified container.

        Args:
            container_name (str): The name of the container.

        Returns:
            list: A list of BlobProperties items for the blobs in the
                container. Empty on error.
        """
        blobs_list = []
        try:
            container_client = self.blob_service_client.get_container_client(
                container_name)
            for blob in container_client.list_blobs():
                blobs_list.append(blob)
        except AzureError as err:
            self._logAzureError(err=err)
        except Exception:
            logger.exception('Fatal error')
        return blobs_list

    def upload_data(self, data, container_name, blob_name, blob_type='BlockBlob'):
        """Creates a new blob from a data source with automatic chunking.

        Args:
            data: The blob data to upload.
            container_name (str): The name of the container.
            blob_name (str): The name of the blob.
            blob_type (str): The type of the blob. This can be either BlockBlob,
                PageBlob or AppendBlob.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        success = False
        try:
            blob_client = self.blob_service_client.get_blob_client(
                container_name, blob_name)
            blob_client.upload_blob(data)
            success = True
        except AzureError as err:
            self._logAzureError(err=err)
        except Exception:
            logger.exception('')
        return success

    def append_block(self, data, container_name, blob_name):
        """Commits a new block of data to the end of the existing append blob.

        Args:
            data: Content of the block.
            container_name (str): The name of the container.
            blob_name (str): The name of the blob.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        success = False
        try:
            blob_client = self.blob_service_client.get_blob_client(
                container_name, blob_name)
            blob_client.append_block(data)
            success = True
        except AzureError as err:
            self._logAzureError(err=err)
        except Exception:
            logger.exception('')
        return success

    def download_data(self, container_name, blob_name):
        """Downloads a blob.

        Args:
            container_name (str): The name of the container.
            blob_name (str): The name of the blob.

        Returns:
            The fully read blob content, or None on error.
        """
        try:
            blob_client = self.blob_service_client.get_blob_client(
                container_name, blob_name)
            stream = blob_client.download_blob()
            return stream.readall()
        except AzureError as err:
            self._logAzureError(err=err)
        except Exception:
            logger.exception('')
class Connector:
    # Convenience wrapper around azure.storage.blob clients: resolves
    # storage-account/container/file-path triples from azure:// or https://
    # URLs and hands out service/container clients. Authentication comes from
    # DefaultAzureCredential (e.g. the Azure CLI login).
    def __init__(self, path=None, storage_account=None, container=None):
        logging.basicConfig(level=logging.INFO)
        self.storage_account = storage_account
        self.container = container
        # A path argument overrides the explicit storage_account/container.
        if path:
            parsed_path = self.parse_azure_path(path)
            self.storage_account = parsed_path["storage_account"]
            self.container = parsed_path["container"]
        # Gets credential from azure cli
        self.credential = DefaultAzureCredential()
        # Create class wide storage account and container clients if names are passed
        if self.storage_account:
            blob_storage_url = self.get_blob_storage_url(
                storage_account=self.storage_account
            )
            self.blob_service_client = BlobServiceClient(
                credential=self.credential, account_url=blob_storage_url
            )
            # NOTE(review): container client creation assumed nested under the
            # storage-account branch (it needs blob_service_client) — confirm.
            if self.container:
                container_names = [
                    container.name
                    for container in self.blob_service_client.list_containers()
                ]
                if self.container in container_names:
                    self.container_client = (
                        self.blob_service_client.get_container_client(
                            container=self.container
                        )
                    )
                else:
                    raise ValueError(
                        f"The container: {self.container} is not in the storage account: {self.storage_account}"
                    )

    @arguments_decorator()
    def get_blob_storage_url(
        self,
        path: str = None,
        storage_account: str = None,
        container: str = None,
        file_path: str = None,
    ) -> str:
        """
        Returns the storage account url for the path or storage_account name passed

        :param path: str: optional An azure path. Defaults to None.
        :param storage_account: str: optional Storage account name. Defaults to None.
        :param container: str: optional Ignored. Defaults to None.
        :param file_path: str: optional Ignored. Defaults to None.
        :return str: The storage account url in the form:
            https://{storage_account}.blob.core.windows.net/
        """
        return f"https://{storage_account}.blob.core.windows.net/"

    def parse_azure_path(self, path: str) -> dict:
        """
        Parse an azure url into : storage_account, container and filepath.
        If passing a url of the for azure://container/filepath the storage account
        is taken from the class instance.
        If there is no storage account passed for the class the storage account
        will be None.

        :param path: str: The azure blob path
        :return: dict: A dictionary containing the container name and filepath
        """
        # Fall back to the instance values for anything the path doesn't carry.
        storage_account = self.storage_account
        container = self.container
        if path.startswith("https://"):
            # Full https URL: extract the account, then strip host to get
            # container/filepath.
            storage_account = re.findall(
                r"https://(.*)\.blob\.core\.windows\.net", path
            )[0]
            path = path.replace(f"https://{storage_account}.blob.core.windows.net/", "")
            split_path = path.split("/")
            container = split_path.pop(0)
            filepath = "/".join(split_path)
        elif path.startswith("azure://"):
            # Short form: azure://container/filepath (account from instance).
            path = path.replace("azure://", "")
            split_path = path.split("/")
            container = split_path.pop(0)
            filepath = "/".join(split_path)
        else:
            # Not an azure path: treat the whole string as the file path.
            filepath = path
        return {
            "storage_account": storage_account,
            "container": container,
            "file_path": filepath,
        }

    def is_azure_path(self, path: str) -> bool:
        """
        Returns true if the path is of a recognised azure path format

        :param path: str: The path to test
        :return bool: True if path is of an accepted azure path format
        """
        patterns = [r"https://.*\.blob.core.windows.net", r"azure://"]
        return any([bool(re.match(p, path)) for p in patterns])

    @arguments_decorator()
    def get_blob_service_client(
        self,
        path: str = None,
        storage_account: str = None,
        container: str = None,
        file_path: str = None,
    ) -> BlobServiceClient:
        """
        Returns a blob service client for the specified storage account.
        If no parameters are passed the class values are used

        :param path: str: optional An azure path, the storage account will be
            used to create a client. Defaults to None.
        :param storage_account: str: optional The name of the storage account
            to create a client for. Defaults to None.
        :param container: str: optional Ignored. Defaults to None.
        :param file_path: str: optional Ignored. Defaults to None.
        :return BlobServiceClient: An azure blobserviceclient for the
            specified storage account
        """
        # Reuse the cached client when the account matches the instance's.
        if storage_account == self.storage_account:
            return self.blob_service_client
        else:
            blob_storage_url = self.get_blob_storage_url(
                storage_account=storage_account
            )
            return BlobServiceClient(
                credential=self.credential, account_url=blob_storage_url
            )

    @arguments_decorator()
    def get_container_client(
        self,
        path: str = None,
        storage_account: str = None,
        container: str = None,
        file_path: str = None,
    ) -> ContainerClient:
        """
        Returns a container client when a container name in the storage account
        is passed. If no params are passed the class values will be used

        :param path: str: optional An Azure path, the container in the path will
            be used. Defaults to None.
        :param storage_account: str: optional A storage account name containing
            the container. Defaults to None.
        :param container: str: optional The name of the container to create a
            client for. Defaults to None.
        :param file_path: str: optional The file path will ultimately be ignored.
            Defaults to None.
        :exception ValueError: Raised if the container does not exist in the
            storage account
        :return ContainerClient: An Azure client for the container
        """
        # Reuse the cached container client when both account and container match.
        if storage_account == self.storage_account and container == self.container:
            return self.container_client
        else:
            client = self.get_blob_service_client(storage_account=storage_account)
            container_names = [container.name for container in client.list_containers()]
            if container in container_names:
                return client.get_container_client(container=container)
            else:
                raise ValueError(
                    f"The container: {container} is not in the storage account: {storage_account}"
                )

    @arguments_decorator()
    def list_blobs(
        self,
        path: str = None,
        storage_account: str = None,
        container: str = None,
        file_path: str = None,
    ) -> list:
        """
        Returns a list of blobs, with paths that match the path passed

        :param path: str: optional An azure path to search for blobs.
            Defaults to None.
        :param storage_account: str: optional storage account name.
            Defaults to None.
        :param container: str: optional container name. Defaults to None.
        :param file_path: str: optional the prefix file path. Defaults to None.
        :return list: Blobs in the path passed
        """
        container_client = self.get_container_client(
            storage_account=storage_account, container=container
        )
        if file_path:
            # Names are returned relative to the prefix (prefix stripped).
            blob_iter = container_client.list_blobs(name_starts_with=file_path)
            return [blob.name.replace(file_path, "") for blob in blob_iter]
        else:
            blob_iter = container_client.list_blobs()
            return [blob.name for blob in blob_iter]

    @multi_arguments_decorator(local_support=True)
    def download_folder(
        self,
        source_path: str = None,
        source_storage_account: str = None,
        source_container: str = None,
        source_file_path: str = None,
        dest_path: str = None,
        dest_storage_account: str = None,
        dest_container: str = None,
        dest_file_path: str = None,
    ):
        """
        Copy a folder from azure to a local path

        :param source_path: str: optional An Azure path to the folder to
            download. Defaults to None.
        :param source_storage_account: str: optional The storage account name.
            Defaults to None.
        :param source_container: str: optional The container name.
            Defaults to None.
        :param source_file_path: str: optional The path to the folder to
            download. Defaults to None.
        :param dest_path: str: optional The local path to download the folder
            to. Defaults to None.
        :param dest_storage_account: str: optional Ignored. Defaults to None.
        :param dest_container: str: optional Ignored. Defaults to None.
        :param dest_file_path: str: optional Ignored. Defaults to None.
        :exception ValueError: Raised when destination path is an azure path
        """
        container_client = self.get_container_client(
            storage_account=source_storage_account, container=source_container
        )
        if self.is_azure_path(dest_path):
            raise ValueError(
                f"Expected destination to be local path got azure path: {dest_path}"
            )
        os.makedirs(dest_path, exist_ok=True)
        # Blobs are flattened: only the basename is kept locally.
        for blob in container_client.list_blobs(source_file_path):
            file_name = os.path.basename(blob.name)
            local_path = os.path.join(dest_path, file_name)
            with open(local_path, "wb") as f:
                logging.info(f"Downloading {blob.name} to {local_path}")
                blob_data = container_client.download_blob(blob.name)
                blob_data.readinto(f)
        logging.info("Completed Download")

    @arguments_decorator()
    def blob_exists(
        self,
        path: str = None,
        storage_account: str = None,
        container: str = None,
        file_path: str = None,
    ):
        """
        Checks if a file exists in azure, return bool

        :param path: str: optional Azure path to file to check.
            Defaults to None.
        :param storage_account: str: optional Storage account. Defaults to None.
        :param container: str: optional Container. Defaults to None.
        :param file_path: str: optional path to file. Defaults to None.
        :return [bool]: True if file exists
        """
        client = self.get_blob_service_client(storage_account=storage_account)
        blob_client = client.get_blob_client(container, file_path)
        return blob_client.exists()

    @multi_arguments_decorator(local_support=True)
    def upload_folder(
        self,
        source_path: str = None,
        source_storage_account: str = None,
        source_container: str = None,
        source_file_path: str = None,
        dest_path: str = None,
        dest_storage_account: str = None,
        dest_container: str = None,
        dest_file_path: str = None,
    ):
        """
        Upload a directory to an azure location. Subdirectories are not
        currently supported

        :param source_path: str: optional Local path to folder to upload.
            Defaults to None.
        :param source_storage_account: str: optional Ignored. Defaults to None.
        :param source_container: str: optional Ignored. Defaults to None.
        :param source_file_path: str: optional Ignored. Defaults to None.
        :param dest_path: str: optional Azure path to upload to.
            Defaults to None.
        :param dest_storage_account: str: optional Storage account.
            Defaults to None.
        :param dest_container: str: optional Container name. Defaults to None.
        :param dest_file_path: str: optional Path to folder. Defaults to None.
        :exception ValueError: Raised if source is an Azure path
        """
        if self.is_azure_path(source_path):
            raise ValueError(
                f"Expected destination to be local path got azure path: {source_path}"
            )
        container_client = self.get_container_client(
            storage_account=dest_storage_account, container=dest_container
        )
        for root, dirs, files in os.walk(source_path):
            logging.warning(
                "upload folder does not support sub-directories only files will be uploaded"
            )
            for file in files:
                file_path = os.path.join(root, file)
                # NOTE(review): blob path is plain concatenation — dest_file_path
                # presumably ends with '/'; confirm against callers.
                blob_path = dest_file_path + file
                logging.info(f"Uploading {file_path} to {blob_path}")
                with open(file_path, "rb") as data:
                    container_client.upload_blob(name=blob_path, data=data)

    @arguments_decorator(local_support=True)
    def open(
        self,
        path: str = None,
        storage_account: str = None,
        container: str = None,
        file_path: str = None,
        mode="r",
        *args,
        **kwargs,
    ):
        """
        wrapper around smart_open so we dont have to pass a blob client everywhere.

        :param path: str: optional Local or azure path. Defaults to None.
        :param storage_account: str: optional name of storage account.
            Defaults to None.
        :param container: str: optional container name. Defaults to None.
        :param file_path: str: optional path to file. Defaults to None.
        :param mode: str: optional open mode. Defaults to "r".
        :return [smart_open.open]: Opens both local and azure files
        """
        if path and not self.is_azure_path(path) and "w" in mode:
            # if it is local write mode, check the path and create folder if needed
            subdir = os.path.dirname(path)
            if subdir:
                os.makedirs(subdir, exist_ok=True)
        if storage_account:
            transport_params = {
                "client": self.get_blob_service_client(storage_account=storage_account)
            }
        else:
            transport_params = {"client": None}
        # Caller-supplied transport_params win over the one built here.
        if "transport_params" not in kwargs:
            kwargs["transport_params"] = transport_params
        path = path if path else f"azure://{container}/{file_path}"
        return smart_open.open(path, mode, *args, **kwargs)
class AzureBlobStorage(object):
    """
    Instantiate AzureBlobStorage Class for a given Azure storage account.

    `Args:`
        account_name: str
            The name of the Azure storage account to use. Not required if
            ``AZURE_ACCOUNT_NAME`` environment variable is set, or if
            ``account_url`` is supplied.
        credential: str
            An account shared access key with access to the Azure storage
            account, an SAS token string, or an instance of a TokenCredentials
            class. Not required if ``AZURE_CREDENTIAL`` environment variable
            is set.
        account_domain: str
            The domain of the Azure storage account, defaults to
            "blob.core.windows.net". Not required if ``AZURE_ACCOUNT_DOMAIN``
            environment variable is set or if ``account_url`` is supplied.
        account_url: str
            The account URL for the Azure storage account including the
            account name and domain. Not required if ``AZURE_ACCOUNT_URL``
            environment variable is set.
    `Returns:`
        `AzureBlobStorage`
    """

    def __init__(self, account_name=None, credential=None,
                 account_domain='blob.core.windows.net', account_url=None):
        self.account_url = os.getenv('AZURE_ACCOUNT_URL', account_url)
        self.credential = check_env.check('AZURE_CREDENTIAL', credential)
        if not self.account_url:
            self.account_name = check_env.check('AZURE_ACCOUNT_NAME', account_name)
            self.account_domain = check_env.check('AZURE_ACCOUNT_DOMAIN', account_domain)
            self.account_url = f'https://{self.account_name}.{self.account_domain}/'
        else:
            if not self.account_url.startswith('http'):
                self.account_url = f'https://{self.account_url}'
            # Update the account name and domain if a URL is supplied
            parsed_url = urlparse(self.account_url)
            self.account_name = parsed_url.netloc.split(".")[0]
            self.account_domain = ".".join(parsed_url.netloc.split(".")[1:])
        self.client = BlobServiceClient(account_url=self.account_url,
                                        credential=self.credential)

    def list_containers(self):
        """
        Returns a list of container names for the storage account

        `Returns:`
            list[str]
                List of container names
        """
        container_names = [container.name for container in
                           self.client.list_containers()]
        logger.info(f'Found {len(container_names)} containers.')
        return container_names

    def container_exists(self, container_name):
        """
        Verify that a container exists within the storage account

        `Args:`
            container_name: str
                The name of the container
        `Returns:`
            bool
        """
        container_client = self.get_container(container_name)
        try:
            container_client.get_container_properties()
            logger.info(f'{container_name} exists.')
            return True
        except ResourceNotFoundError:
            logger.info(f'{container_name} does not exist.')
            return False

    def get_container(self, container_name):
        """
        Returns a container client

        `Args:`
            container_name: str
                The name of the container
        `Returns:`
            `ContainerClient`
        """
        logger.info(f'Returning {container_name} container client')
        return self.client.get_container_client(container_name)

    def create_container(self, container_name, metadata=None, public_access=None, **kwargs):
        """
        Create a container

        `Args:`
            container_name: str
                The name of the container
            metadata: Optional[dict[str, str]]
                A dict with metadata to associated with the container.
            public_access: Optional[Union[PublicAccess, str]]
                Settings for public access on the container, can be 'container'
                or 'blob' if not ``None``
            kwargs:
                Additional arguments to be supplied to the Azure Blob Storage API.
                See `Azure Blob Storage SDK documentation
                <https://docs.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobserviceclient?view=azure-python#create-container-name--metadata-none--public-access-none----kwargs->`_
                for more info.
        `Returns:`
            `ContainerClient`
        """  # noqa
        container_client = self.client.create_container(
            container_name, metadata=metadata, public_access=public_access, **kwargs
        )
        logger.info(f'Created {container_name} container.')
        return container_client

    def delete_container(self, container_name):
        """
        Delete a container.

        `Args:`
            container_name: str
                The name of the container
        `Returns:`
            ``None``
        """
        self.client.delete_container(container_name)
        logger.info(f'{container_name} container deleted.')

    def list_blobs(self, container_name, name_starts_with=None):
        """
        List all of the blobs in a container

        `Args:`
            container_name: str
                The name of the container
            name_starts_with: Optional[str]
                A prefix to filter blob names
        `Returns:`
            list
                A list of blob properties objects (not just the names)
        """
        container_client = self.get_container(container_name)
        blobs = [
            blob for blob in container_client.list_blobs(name_starts_with=name_starts_with)
        ]
        logger.info(f'Found {len(blobs)} blobs in {container_name} container.')
        return blobs

    def blob_exists(self, container_name, blob_name):
        """
        Verify that a blob exists in the specified container

        `Args:`
            container_name: str
                The container name
            blob_name: str
                The blob name
        `Returns:`
            bool
        """
        blob_client = self.get_blob(container_name, blob_name)
        try:
            blob_client.get_blob_properties()
            logger.info(f'{blob_name} exists in {container_name} container.')
            return True
        except ResourceNotFoundError:
            logger.info(f'{blob_name} does not exist in {container_name} container.')
            return False

    def get_blob(self, container_name, blob_name):
        """
        Get a blob object

        `Args:`
            container_name: str
                The container name
            blob_name: str
                The blob name
        `Returns:`
            `BlobClient`
        """
        blob_client = self.client.get_blob_client(container_name, blob_name)
        logger.info(f'Got {blob_name} blob from {container_name} container.')
        return blob_client

    def get_blob_url(self, container_name, blob_name, account_key=None,
                     permission=None, expiry=None, start=None):
        """
        Get a URL with a shared access signature for a blob

        `Args:`
            container_name: str
                The container name
            blob_name: str
                The blob name
            account_key: Optional[str]
                An account shared access key for the storage account. Will default
                to the key used on initialization if one was provided as the
                credential, but required if it was not.
            permission: Optional[Union[BlobSasPermissions, str]]
                Permissions associated with the blob URL. Can be either a
                BlobSasPermissions object or a string where 'r', 'a', 'c', 'w',
                and 'd' correspond to read, add, create, write, and delete
                permissions respectively.
            expiry: Optional[Union[datetime, str]]
                The datetime when the URL should expire. Defaults to UTC.
            start: Optional[Union[datetime, str]]
                The datetime when the URL should become valid. Defaults to UTC.
                If it is ``None``, the URL becomes active when it is first created.
        `Returns:`
            str
                URL with shared access signature for blob
        """
        if not account_key:
            if not self.credential:
                raise ValueError(
                    'An account shared access key must be provided if it was not on initialization'
                )
            account_key = self.credential
        sas = generate_blob_sas(
            self.account_name,
            container_name,
            blob_name,
            account_key=account_key,
            permission=permission,
            expiry=expiry,
            start=start,
        )
        # The SAS token is itself a URL query string and must be appended
        # verbatim after '?'. The previous '?sas={sas}' form produced an
        # unauthorized URL. account_url may carry a trailing '/', so strip it
        # to avoid a double slash.
        base_url = self.account_url.rstrip('/')
        return f'{base_url}/{container_name}/{blob_name}?{sas}'

    def _get_content_settings_from_dict(self, kwargs_dict):
        """
        Removes any keys for ``ContentSettings`` from a dict and returns a tuple
        of the generated settings or ``None`` and a dict with the settings keys
        removed.

        `Args:`
            kwargs_dict: dict
                A dict which should be processed and may have keys for
                ``ContentSettings``
        `Returns:`
            Tuple[Optional[ContentSettings], dict]
                Any created settings or ``None`` and the dict with settings keys
                removed
        """
        kwargs_copy = {**kwargs_dict}
        content_settings = None
        content_settings_dict = {}
        content_settings_keys = [
            'content_type', 'content_encoding', 'content_language',
            'content_disposition', 'cache_control', 'content_md5'
        ]
        kwarg_keys = list(kwargs_copy.keys())
        for key in kwarg_keys:
            if key in content_settings_keys:
                content_settings_dict[key] = kwargs_copy.pop(key)
        if content_settings_dict:
            content_settings = ContentSettings(**content_settings_dict)
        return content_settings, kwargs_copy

    def put_blob(self, container_name, blob_name, local_path, **kwargs):
        """
        Puts a blob (aka file) in a bucket

        `Args:`
            container_name: str
                The name of the container to store the blob
            blob_name: str
                The name of the blob to be stored
            local_path: str
                The local path of the file to upload
            kwargs:
                Additional arguments to be supplied to the Azure Blob Storage API.
                See `Azure Blob Storage SDK documentation
                <https://docs.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobclient?view=azure-python#upload-blob-data--blob-type--blobtype-blockblob---blockblob----length-none--metadata-none----kwargs->`_
                for more info. Any keys that belong to the ``ContentSettings``
                object will be provided to that class directly.
        `Returns:`
            `BlobClient`
        """  # noqa
        blob_client = self.get_blob(container_name, blob_name)
        # Move all content_settings keys into a ContentSettings object
        content_settings, kwargs_dict = self._get_content_settings_from_dict(kwargs)
        with open(local_path, 'rb') as f:
            data = f.read()
            blob_client = blob_client.upload_blob(
                data,
                overwrite=True,
                content_settings=content_settings,
                **kwargs_dict,
            )
        logger.info(f'{blob_name} blob put in {container_name} container')
        # Return refreshed BlobClient object
        return self.get_blob(container_name, blob_name)

    def download_blob(self, container_name, blob_name, local_path=None):
        """
        Downloads a blob from a container into the specified file path or a
        temporary file path

        `Args:`
            container_name: str
                The container name
            blob_name: str
                The blob name
            local_path: Optional[str]
                The local path where the file will be downloaded. If not
                specified, a temporary file will be created and returned, and
                that file will be removed automatically when the script is done
                running.
        `Returns:`
            str
                The path of the downloaded file
        """
        if not local_path:
            local_path = files.create_temp_file_for_path('TEMPFILEAZURE')
        blob_client = self.get_blob(container_name, blob_name)
        logger.info(f'Downloading {blob_name} blob from {container_name} container.')
        with open(local_path, 'wb') as f:
            blob_client.download_blob().readinto(f)
        logger.info(f'{blob_name} blob saved to {local_path}.')
        return local_path

    def delete_blob(self, container_name, blob_name):
        """
        Delete a blob in a specified container.

        `Args:`
            container_name: str
                The container name
            blob_name: str
                The blob name
        `Returns:`
            ``None``
        """
        blob_client = self.get_blob(container_name, blob_name)
        blob_client.delete_blob()
        logger.info(f'{blob_name} blob in {container_name} container deleted.')

    def upload_table(self, table, container_name, blob_name, data_type='csv', **kwargs):
        """
        Load the data from a Parsons table into a blob.

        `Args:`
            table: obj
                A :ref:`parsons-table`
            container_name: str
                The container name to upload the data into
            blob_name: str
                The blob name to upload the data into
            data_type: str
                The file format to use when writing the data. One of: `csv` or
                `json`
            kwargs:
                Additional keyword arguments to supply to ``put_blob``
        `Returns:`
            `BlobClient`
        """
        if data_type == 'csv':
            local_path = table.to_csv()
            content_type = 'text/csv'
        elif data_type == 'json':
            local_path = table.to_json()
            content_type = 'application/json'
        else:
            raise ValueError(f'Unknown data_type value ({data_type}): must be one of: csv or json')
        return self.put_blob(
            container_name, blob_name, local_path, content_type=content_type, **kwargs
        )