def __readFileFromDataLake(self, blobPath):
        # Data Lake connection details are read from the config file
        storage_account_name = self.config["datalakeInfo"][
            "storageAccountName"]
        file_system_name = self.config["datalakeInfo"]["fileSystemName"]
        client_id = self.config["datalakeInfo"]["clientId"]
        tenant_id = self.config["datalakeInfo"]["tenantId"]
        client_secret = self.config["datalakeInfo"]["clientSecret"]

        try:
            credential = azure.identity.ClientSecretCredential(
                tenant_id, client_id, client_secret)

            service_client = DataLakeServiceClient(
                account_url="{}://{}.dfs.core.windows.net".format(
                    "https", storage_account_name),
                credential=credential)

            file_system_client = service_client.get_file_system_client(
                file_system_name)

            file_client = file_system_client.get_file_client(blobPath)

            downloaded = file_client.download_file()
            downloaded_bytes = downloaded.readall()

            return downloaded_bytes

        except Exception as e:
            print(e)
    def test_restore_file_system_with_sas(self, datalake_storage_account_name, datalake_storage_account_key):
        pytest.skip(
            "Generating a SAS token requires a live run, and this test also needs a soft-delete enabled account.")
        self._setUp(datalake_storage_account_name, datalake_storage_account_key)
        token = generate_account_sas(
            self.dsc.account_name,
            self.dsc.credential.account_key,
            ResourceTypes(service=True, file_system=True),
            AccountSasPermissions(read=True, write=True, list=True, delete=True),
            datetime.utcnow() + timedelta(hours=1),
        )
        dsc = DataLakeServiceClient(self.dsc.url, token)
        name = self._get_file_system_reference(prefix="filesystem")
        filesystem_client = dsc.create_file_system(name)
        filesystem_client.delete_file_system()
        # to make sure the filesystem is deleted
        with self.assertRaises(ResourceNotFoundError):
            filesystem_client.get_file_system_properties()

        filesystem_list = list(dsc.list_file_systems(include_deleted=True))
        self.assertTrue(len(filesystem_list) >= 1)

        restored_version = 0
        for filesystem in filesystem_list:
            # find the deleted filesystem and restore it
            if filesystem.deleted and filesystem.name == filesystem_client.file_system_name:
                restored_fs_client = dsc.undelete_file_system(filesystem.name, filesystem.deleted_version,
                                                              new_name="restored" + name + str(restored_version))
                restored_version += 1

                # to make sure the deleted filesystem is restored
                props = restored_fs_client.get_file_system_properties()
                self.assertIsNotNone(props)
Example #3
    def setUp(self):
        super(LargeFileTest, self).setUp()
        url = self._get_account_url()
        self.payload_dropping_policy = PayloadDroppingPolicy()
        credential_policy = _format_shared_key_credential(
            self.settings.STORAGE_DATA_LAKE_ACCOUNT_NAME,
            self.settings.STORAGE_DATA_LAKE_ACCOUNT_KEY)
        self.dsc = DataLakeServiceClient(
            url,
            credential=self.settings.STORAGE_DATA_LAKE_ACCOUNT_KEY,
            logging_enable=True,
            _additional_pipeline_policies=[
                self.payload_dropping_policy, credential_policy
            ])
        self.config = self.dsc._config

        self.file_system_name = self.get_resource_name('filesystem')

        if not self.is_playback():
            file_system = self.dsc.get_file_system_client(
                self.file_system_name)
            try:
                file_system.create_file_system(timeout=5)
            except ResourceExistsError:
                pass
Example #4
 def setUp(self):
     super(FileSystemTest, self).setUp()
     url = self._get_account_url()
     self.dsc = DataLakeServiceClient(
         url, credential=self.settings.STORAGE_DATA_LAKE_ACCOUNT_KEY)
     self.config = self.dsc._config
     self.test_file_systems = []
Example #5
    def test_read_file_with_user_delegation_key(self):
        # SAS URL is calculated from storage key, so this test runs live only
        if TestMode.need_recording_file(self.test_mode):
            return

        # Create file
        file_client = self._create_file_and_return_client()
        data = self.get_random_bytes(1024)
        # Upload data to file
        file_client.append_data(data, 0, len(data))
        file_client.flush_data(len(data))

        # Get user delegation key
        token_credential = self.generate_oauth_token()
        service_client = DataLakeServiceClient(self._get_oauth_account_url(), credential=token_credential)
        user_delegation_key = service_client.get_user_delegation_key(datetime.utcnow(),
                                                                     datetime.utcnow() + timedelta(hours=1))

        sas_token = generate_file_sas(file_client.account_name,
                                      file_client.file_system_name,
                                      None,
                                      file_client.path_name,
                                      user_delegation_key,
                                      permission=FileSasPermissions(read=True, create=True, write=True, delete=True),
                                      expiry=datetime.utcnow() + timedelta(hours=1),
                                      )

        # download the data and make sure it is the same as the uploaded data
        new_file_client = DataLakeFileClient(self._get_account_url(),
                                             file_client.file_system_name,
                                             file_client.path_name,
                                             credential=sas_token)
        downloaded_data = new_file_client.download_file().readall()
        self.assertEqual(data, downloaded_data)
Example #6
 def __init__(self):
     url = self.datalake_account_url()
     key = self.datalake_account_key()
     print('url: {}'.format(url))
     print('key: {}'.format(key))
     self.service_client = DataLakeServiceClient(account_url=url,
                                                 credential=key)
     print(self.service_client)
Example #7
 def connect_to_adls(acct_name, acct_key, fs):
     try:
         global service_client
         
         service_client = DataLakeServiceClient(account_url="{}://{}.dfs.core.windows.net".format(
             "https", acct_name), credential=acct_key)
         file_system_client = service_client.get_file_system_client(file_system=fs)
         return file_system_client
     except Exception as e:
         print(e)
         return e
    def _setUp(self, account_name, account_key):
        url = self._get_account_url(account_name)
        self.dsc = DataLakeServiceClient(url, credential=account_key, logging_enable=True)
        self.config = self.dsc._config
        self.filesystem_name = self.get_resource_name('utqqcontainer')

        if not self.is_playback():
            try:
                self.dsc.create_file_system(self.filesystem_name)
            except Exception:
                pass
    def setUp(self):
        super(StorageQuickQueryTest, self).setUp()
        url = self._get_account_url()
        self.dsc = DataLakeServiceClient(url, credential=self.settings.STORAGE_DATA_LAKE_ACCOUNT_KEY, logging_enable=True)
        self.config = self.dsc._config
        self.filesystem_name = self.get_resource_name('utqqcontainer')

        if not self.is_playback():
            try:
                self.dsc.create_file_system(self.filesystem_name)
            except Exception:
                pass
Example #10
 def __init__(self, container_url, path):
     storage_account, file_system_name = get_details_from_container_url(
         container_url)
     self.container_url = container_url
     self.path = path
     self.storage_account = storage_account
     self.file_system_name = file_system_name
     storage_config = AzStorageConfig.objects.get(
         storage_account=storage_account, container_name=file_system_name)
     self.service = DataLakeServiceClient(
         f"https://{storage_account}.dfs.core.windows.net/",
         credential=storage_config.storage_account_key)
     self.directory_client = self.service.get_directory_client(
         file_system_name, path)
Example #11
    def setUp(self):
        super(FileTest, self).setUp()
        url = self._get_account_url()
        self.dsc = DataLakeServiceClient(url, credential=self.settings.STORAGE_DATA_LAKE_ACCOUNT_KEY)
        self.config = self.dsc._config

        self.file_system_name = self.get_resource_name('filesystem')

        if not self.is_playback():
            file_system = self.dsc.get_file_system_client(self.file_system_name)
            try:
                file_system.create_file_system(timeout=5)
            except ResourceExistsError:
                pass
    def create_connection(self, storage_account_name, storage_account_key,
                          container):
        try:
            dfs_url = "{}://{}.dfs.core.windows.net".format(
                "https", storage_account_name)
            blob_url = "{}://{}.blob.core.windows.net/".format(
                "https", storage_account_name)
            # print("ADLS URL:", dfs_url)
            # print("Blob URL:", blob_url)
            self.service_client = DataLakeServiceClient(
                account_url=dfs_url, credential=storage_account_key)
            # print("Getting file_system_client...")
            self.file_system_client = self.service_client.get_file_system_client(
                file_system=self.settings.storage_container)
            # print("Getting blob_service_client...")
            connect_string="DefaultEndpointsProtocol=https;AccountName=" + storage_account_name + ";AccountKey="\
                           + storage_account_key + ";EndpointSuffix=core.windows.net"
            self.blob_service_client = BlobServiceClient.from_connection_string(
                conn_str=connect_string)
            # Create sas token for blob

            self.sas_token = generate_account_sas(
                account_name=self.blob_service_client.account_name,
                account_key=storage_account_key,
                resource_types=ResourceTypes(service=True,
                                             object=True,
                                             container=True),
                permission=AccountSasPermissions(read=True,
                                                 write=True,
                                                 delete=True,
                                                 list=True,
                                                 add=True,
                                                 create=True),
                start=datetime.utcnow() - timedelta(hours=1),
                expiry=datetime.utcnow() + timedelta(hours=4)  # Token valid for 4 hours
            )
            self.container_client = self.blob_service_client.get_container_client(
                container)
            # print("returning references.")
            return self.service_client\
                , self.file_system_client\
                , self.blob_service_client\
                , self.container_client\
                , self.sas_token

        except Exception as e:
            print(e)
            return None, None, None, None, None
Example #13
    def __init__(self,
                 connection_string=os.getenv("AZURE_DT_2"),
                 container_name_="container06",
                 file_name_=""):  # file_name_ parameter added so the attribute set below is defined
        account_name = os.getenv('STORAGE_ACCOUNT_NAME', "")
        account_key = os.getenv('STORAGE_ACCOUNT_KEY', "")

        # set up the service client with the credentials from the environment variables
        self.service_client = DataLakeServiceClient(
            account_url="{}://{}.dfs.core.windows.net".format(
                "https", account_name),
            credential=account_key)
        self.file_system_name = container_name_
        self.file_name = file_name_
        self.dict_inh = {}
        self.dict_of_directory = {}
Example #14
    def test_preauthorize_user_with_user_delegation_key(
            self, datalake_storage_account_name, datalake_storage_account_key):
        self._setUp(datalake_storage_account_name,
                    datalake_storage_account_key)
        # SAS URL is calculated from storage key, so this test runs live only

        # Create file
        file_client = self._create_file_and_return_client()
        data = self.get_random_bytes(1024)
        # Upload data to file
        file_client.append_data(data, 0, len(data))
        file_client.flush_data(len(data))
        file_client.set_access_control(
            owner="68390a19-a643-458b-b726-408abf67b4fc", permissions='0777')
        acl = file_client.get_access_control()

        # Get user delegation key
        token_credential = self.generate_oauth_token()
        service_client = DataLakeServiceClient(
            self._get_account_url(datalake_storage_account_name),
            credential=token_credential)
        user_delegation_key = service_client.get_user_delegation_key(
            datetime.utcnow(),
            datetime.utcnow() + timedelta(hours=1))

        sas_token = generate_file_sas(
            file_client.account_name,
            file_client.file_system_name,
            None,
            file_client.path_name,
            user_delegation_key,
            permission=FileSasPermissions(read=True,
                                          write=True,
                                          manage_access_control=True,
                                          manage_ownership=True),
            expiry=datetime.utcnow() + timedelta(hours=1),
            preauthorized_agent_object_id="68390a19-a643-458b-b726-408abf67b4fc"
        )

        # download the data and make sure it is the same as the uploaded data
        new_file_client = DataLakeFileClient(
            self._get_account_url(datalake_storage_account_name),
            file_client.file_system_name,
            file_client.path_name,
            credential=sas_token)

        acl = new_file_client.set_access_control(permissions='0777')
        self.assertIsNotNone(acl)
    def test_list_system_filesystems(self, datalake_storage_account_name,
                                     datalake_storage_account_key):
        self._setUp(datalake_storage_account_name,
                    datalake_storage_account_key)
        # Arrange
        dsc = DataLakeServiceClient(self.dsc.url,
                                    credential=datalake_storage_account_key)
        # Act
        filesystems = list(dsc.list_file_systems(include_system=True))

        # Assert
        found = False
        for fs in filesystems:
            if fs.name == "$logs":
                found = True
        self.assertEqual(found, True)
    def file_system_sample(self):

        # [START create_file_system_client_from_service]
        # Instantiate a DataLakeServiceClient using a connection string
        from azure.storage.filedatalake import DataLakeServiceClient
        datalake_service_client = DataLakeServiceClient.from_connection_string(
            self.connection_string)

        # Instantiate a FileSystemClient
        file_system_client = datalake_service_client.get_file_system_client(
            "mynewfilesystem")
        # [END create_file_system_client_from_service]

        try:
            # [START create_file_system]
            file_system_client.create_file_system()
            # [END create_file_system]

            # [START get_file_system_properties]
            properties = file_system_client.get_file_system_properties()
            # [END get_file_system_properties]

        finally:
            # [START delete_file_system]
            file_system_client.delete_file_system()
            # [END delete_file_system]
Example #17
    def _setUp(self, account_name, account_key):
        url = self._get_account_url(account_name)
        self.dsc = DataLakeServiceClient(url,
                                         credential=account_key,
                                         logging_enable=True)
        self.config = self.dsc._config

        self.file_system_name = self.get_resource_name('filesystem')

        if not self.is_playback():
            file_system = self.dsc.get_file_system_client(
                self.file_system_name)
            try:
                file_system.create_file_system(timeout=5)
            except ResourceExistsError:
                pass
    def set_metadata_on_file_system(self):

        # Instantiate a DataLakeServiceClient using a connection string
        from azure.storage.filedatalake import DataLakeServiceClient
        datalake_service_client = DataLakeServiceClient.from_connection_string(
            self.connection_string)

        # Instantiate a FileSystemClient
        file_system_client = datalake_service_client.get_file_system_client(
            "mymetadatafilesystemsync")

        try:
            # Create new File System
            file_system_client.create_file_system()

            # [START set_file_system_metadata]
            # Create key, value pairs for metadata
            metadata = {'type': 'test'}

            # Set metadata on the file system
            file_system_client.set_file_system_metadata(metadata=metadata)
            # [END set_file_system_metadata]

            # Get file system properties
            properties = file_system_client.get_file_system_properties()

        finally:
            # Delete file system
            file_system_client.delete_file_system()
 def get_file_system_client(self):
     connect_str = os.environ["ADLS_CONNECTION_STRING"]
     service_client = DataLakeServiceClient.from_connection_string(
         connect_str)
     file_system_client = service_client.get_file_system_client(
         file_system=self.file_system_name)
     return file_system_client
    def get_directory_client_from_file_system(self):

        # Instantiate a DataLakeServiceClient using a connection string
        from azure.storage.filedatalake import DataLakeServiceClient
        datalake_service_client = DataLakeServiceClient.from_connection_string(
            self.connection_string)

        # Instantiate a FileSystemClient
        file_system_client = datalake_service_client.get_file_system_client(
            "myfilesystem")

        # Create new File System
        try:
            file_system_client.create_file_system()
        except ResourceExistsError:
            pass

        # [START get_directory_client_from_file_system]
        # Get the DataLakeDirectoryClient from the FileSystemClient to interact with a specific directory
        directory_client = file_system_client.get_directory_client(
            "mynewdirectory")
        # [END get_directory_client_from_file_system]

        # Delete file system
        file_system_client.delete_file_system()
    def acquire_lease_on_file_system(self):

        # Instantiate a DataLakeServiceClient using a connection string
        # [START create_data_lake_service_client_from_conn_str]
        from azure.storage.filedatalake import DataLakeServiceClient
        datalake_service_client = DataLakeServiceClient.from_connection_string(
            self.connection_string)
        # [END create_data_lake_service_client_from_conn_str]

        # Instantiate a FileSystemClient
        file_system_client = datalake_service_client.get_file_system_client(
            "myleasefilesystem")

        # Create new File System
        try:
            file_system_client.create_file_system()
        except ResourceExistsError:
            pass

        # [START acquire_lease_on_file_system]
        # Acquire a lease on the file system
        lease = file_system_client.acquire_lease()

        # Delete file system by passing in the lease
        file_system_client.delete_file_system(lease=lease)
        # [END acquire_lease_on_file_system]
    def list_paths_in_file_system(self):

        # Instantiate a DataLakeServiceClient using a connection string
        from azure.storage.filedatalake import DataLakeServiceClient
        datalake_service_client = DataLakeServiceClient.from_connection_string(
            self.connection_string)

        # Instantiate a FileSystemClient
        file_system_client = datalake_service_client.get_file_system_client(
            "myfilesystemforlistpaths")

        # Create new File System
        file_system_client.create_file_system()

        # [START upload_file_to_file_system]
        with open(SOURCE_FILE, "rb") as data:
            file_client = file_system_client.get_file_client("myfile")
            file_client.create_file()
            file_client.append_data(data, 0)
            file_client.flush_data(data.tell())
        # [END upload_file_to_file_system]

        # [START get_paths_in_file_system]
        path_list = file_system_client.get_paths()
        for path in path_list:
            print(path.name + '\n')
        # [END get_paths_in_file_system]

        # Delete file system
        file_system_client.delete_file_system()
Example #23
def initialize_adls(storage_account, client_id, client_secret, tenant_id):
    """Initialize the connection to an Azure Data Lake Gen 2

    Args:
        storage_account (string): The storage account name
        client_id (string): The service principal client id
        client_secret (string): The service principal client secret
        tenant_id (string): The Azure tenant id

    Returns:
        [DataLakeServiceClient]: An Azure Data Lake Gen 2 client
    """
    try:
        credential = ClientSecretCredential(tenant_id, client_id,
                                            client_secret)

        service_client = DataLakeServiceClient(
            account_url="{}://{}.dfs.core.windows.net".format(
                "https", storage_account),
            credential=credential)

        return service_client

    except Exception as e:
        print(e)
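For reference, a minimal sketch of how initialize_adls might be called; the environment variable names and the file system name below are hypothetical and not part of the original example.

import os

# Hypothetical environment variables holding the service principal details.
service_client = initialize_adls(
    storage_account=os.environ["STORAGE_ACCOUNT_NAME"],
    client_id=os.environ["AZURE_CLIENT_ID"],
    client_secret=os.environ["AZURE_CLIENT_SECRET"],
    tenant_id=os.environ["AZURE_TENANT_ID"])

# The returned DataLakeServiceClient can then be used as in the other examples,
# e.g. to get a client for an (illustrative) file system name.
file_system_client = service_client.get_file_system_client("my-file-system")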
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        datalake_service_client = DataLakeServiceClient.from_connection_string(
            self.connection_string)
        self.file_system_client = datalake_service_client.get_file_system_client(
            self.container_name)
Example #25
    def __init__(self):
        if os.environ.get('AZURE_SUBSCRIPTION_ID') is not None:
            self.subscription_id = os.environ.get(
                'AZURE_SUBSCRIPTION_ID')  # your Azure Subscription Id
        else:
            raise ValueError(
                'AZURE_SUBSCRIPTION_ID environment variable missing')

        # Sanity check
        if os.environ.get('AZURE_CLIENT_ID') is None:
            raise ValueError('AZURE_CLIENT_ID environment variable missing')
        if os.environ.get('AZURE_CLIENT_SECRET') is None:
            raise ValueError(
                'AZURE_CLIENT_SECRET environment variable missing')
        if os.environ.get('AZURE_TENANT_ID') is None:
            raise ValueError('AZURE_TENANT_ID environment variable missing')
        if os.environ.get('STORAGE_ACCOUNT_NAME') is None:
            raise ValueError(
                'STORAGE_ACCOUNT_NAME environment variable missing')

        self.storage_account_name = os.environ['STORAGE_ACCOUNT_NAME']

        self.credentials = ServicePrincipalCredentials(
            client_id=os.environ['AZURE_CLIENT_ID'],
            secret=os.environ['AZURE_CLIENT_SECRET'],
            tenant=os.environ['AZURE_TENANT_ID'])

        #FIXME do we need two credentials?
        self.client_secret_credential = ClientSecretCredential(
            os.environ['AZURE_TENANT_ID'], os.environ['AZURE_CLIENT_ID'],
            os.environ['AZURE_CLIENT_SECRET'])

        self.resource_client = ResourceManagementClient(
            self.credentials, self.subscription_id)
        self.msi_client = ManagedServiceIdentityClient(self.credentials,
                                                       self.subscription_id)
        self.policy_client = PolicyClient(self.credentials,
                                          self.subscription_id)
        self.authorization_client = AuthorizationManagementClient(
            self.credentials, self.subscription_id)
        self.storage_client = StorageManagementClient(self.credentials,
                                                      self.subscription_id)
        # adls2 storage client
        self.adls2_client = DataLakeServiceClient(
            account_url="{}://{}.dfs.core.windows.net".format(
                "https", self.storage_account_name),
            credential=self.client_secret_credential)
Example #26
    def test_set_acl_with_user_delegation_key(self,
                                              datalake_storage_account_name,
                                              datalake_storage_account_key):
        self._setUp(datalake_storage_account_name,
                    datalake_storage_account_key)
        # SAS URL is calculated from storage key, so this test runs live only

        # Create file
        file_client = self._create_file_and_return_client()
        data = self.get_random_bytes(1024)
        # Upload data to file
        file_client.append_data(data, 0, len(data))
        file_client.flush_data(len(data))

        # Get user delegation key
        token_credential = self.generate_oauth_token()
        service_client = DataLakeServiceClient(
            self._get_account_url(datalake_storage_account_name),
            credential=token_credential)
        user_delegation_key = service_client.get_user_delegation_key(
            datetime.utcnow(),
            datetime.utcnow() + timedelta(hours=1))

        sas_token = generate_file_sas(
            file_client.account_name,
            file_client.file_system_name,
            None,
            file_client.path_name,
            user_delegation_key,
            permission=FileSasPermissions(execute=True,
                                          manage_access_control=True,
                                          manage_ownership=True),
            expiry=datetime.utcnow() + timedelta(hours=1),
        )

        # download the data and make sure it is the same as the uploaded data
        new_file_client = DataLakeFileClient(
            self._get_account_url(datalake_storage_account_name),
            file_client.file_system_name,
            file_client.path_name,
            credential=sas_token)
        acl = 'user::rwx,group::r-x,other::rwx'
        owner = "dc140949-53b7-44af-b1e9-cd994951fb86"
        new_file_client.set_access_control(acl=acl, owner=owner)
        access_control = new_file_client.get_access_control()
        self.assertEqual(acl, access_control['acl'])
        self.assertEqual(owner, access_control['owner'])
Example #27
def initialize_storage_account(storage_account_name, storage_account_key):
    try:
        global service_client
        service_client = DataLakeServiceClient(account_url="{}://{}.dfs.core.windows.net".format(
            "https", storage_account_name), credential=storage_account_key)
        #print(service_client)
    except Exception as e:
        print(e)
Example #28
def create_datalake_service_client() -> DataLakeServiceClient:
    """Return DataLake Service Client."""
    account_name = DL.STORAGE_ACCOUNT_NAME
    credential = DL.STORAGE_ACCOUNT_KEY
    account_url = f"https://{account_name}.dfs.core.windows.net/"
    datalake_service = DataLakeServiceClient(account_url=account_url,
                                             credential=credential)
    return datalake_service
Example #29
    def inner_transfer(self, directory, token):
        basename = "upload"
        suffix = datetime.now().strftime("%y%m%d_%H%M%S")
        filename = "_".join([basename, suffix]) 
        data = self.get_random_bytes(200*1024)
        upload_path = "{}/{}".format(directory, filename)

        print("\nUploading {} to directory: {}".format(filename, directory))

        service_client = DataLakeServiceClient(self.STORAGE_URL, credential=token)
        file_client = service_client.get_file_client(self.STORAGE_FILESYSTEM, upload_path)
        file_client.upload_data(data, overwrite=True, max_concurrency=3)

        print("Upload complete. Re-downloading file...")

        downloaded_data = file_client.download_file().readall()
        print("Downloaded file. Bytes read: {}".format(len(downloaded_data)))
Example #30
 def __missing__(self, key):
     # Lazily create and cache a FileSystemClient per (endpoint, file system) key.
     endpoint, fs_name = key
     if endpoint not in self._service_clients:
         self._service_clients[endpoint] = DataLakeServiceClient(
             endpoint, credential=storage_account_key
         )
     fs_client = self._service_clients[endpoint].get_file_system_client(fs_name)
     self[key] = fs_client
     return fs_client