    def __get_file_paths(
            self, max_files: int,
            local_client_auth: ClientAuthorization) -> List[PathProperties]:
        with FileSystemClient(self.account_url,
                              self.filesystem_name,
                              credential=local_client_auth.get_credential_sync(
                              )) as filesystem_client:

            return self.__get_paths_since_last_run(filesystem_client,
                                                   max_files)
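The private helper `__get_paths_since_last_run` is not shown in this snippet; below is a minimal sketch of what it might look like, assuming it simply pages through `FileSystemClient.get_paths` and stops after `max_files` file entries (the real helper presumably also consults the saved transformation state):
    def __get_paths_since_last_run(
            self, filesystem_client: FileSystemClient,
            max_files: int) -> List[PathProperties]:
        # Illustrative only: walk the filesystem and keep the first `max_files`
        # paths that are files rather than directories.
        paths: List[PathProperties] = []
        for path in filesystem_client.get_paths(recursive=True):
            if path.is_directory:
                continue
            paths.append(path)
            if len(paths) >= max_files:
                break
        return paths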
Example no. 2
    def test_file_sas_only_applies_to_file_level(self,
                                                 datalake_storage_account_name,
                                                 datalake_storage_account_key):
        self._setUp(datalake_storage_account_name,
                    datalake_storage_account_key)
        # SAS URL is calculated from storage key, so this test runs live only
        file_name = self._get_file_reference()
        directory_name = self._get_directory_reference()
        self._create_file_and_return_client(directory=directory_name,
                                            file=file_name)

        # generate a token with file level read and write permissions
        token = generate_file_sas(
            self.dsc.account_name,
            self.file_system_name,
            directory_name,
            file_name,
            self.dsc.credential.account_key,
            permission=FileSasPermissions(read=True, write=True),
            expiry=datetime.utcnow() + timedelta(hours=1),
        )

        # read the created file which is under root directory
        file_client = DataLakeFileClient(self.dsc.url,
                                         self.file_system_name,
                                         directory_name + '/' + file_name,
                                         credential=token)
        properties = file_client.get_file_properties()

        # make sure we can read the file properties
        self.assertIsNotNone(properties)

        # try to write to the created file with the token
        response = file_client.append_data(b"abcd",
                                           0,
                                           4,
                                           validate_content=True)
        self.assertIsNotNone(response)

        # the token is for file level, so users are not supposed to have access to file system level operations
        file_system_client = FileSystemClient(self.dsc.url,
                                              self.file_system_name,
                                              credential=token)
        with self.assertRaises(ClientAuthenticationError):
            file_system_client.get_file_system_properties()

        # the token is for file level, so users are not supposed to have access to directory level operations
        directory_client = DataLakeDirectoryClient(self.dsc.url,
                                                   self.file_system_name,
                                                   directory_name,
                                                   credential=token)
        with self.assertRaises(ClientAuthenticationError):
            directory_client.get_directory_properties()
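The same SAS pattern, stripped of the test harness; a minimal sketch with placeholder account name, key and path:
from datetime import datetime, timedelta

from azure.storage.filedatalake import (DataLakeFileClient, FileSasPermissions,
                                        generate_file_sas)

# Placeholder values -- substitute a real account, key and path.
account_name = "<storage-account>"
account_key = "<account-key>"
file_system_name = "my-filesystem"
directory_name = "mydir"
file_name = "myfile.txt"

# Token scoped to a single file, with read/write permissions, valid for one hour.
sas_token = generate_file_sas(
    account_name,
    file_system_name,
    directory_name,
    file_name,
    account_key,
    permission=FileSasPermissions(read=True, write=True),
    expiry=datetime.utcnow() + timedelta(hours=1),
)

file_client = DataLakeFileClient(
    f"https://{account_name}.dfs.core.windows.net",
    file_system_name,
    directory_name + '/' + file_name,
    credential=sas_token,
)
print(file_client.get_file_properties().size)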
    def get_conn(self) -> FileSystemClient:
        """
        Return an Azure Data Lake FileSystemClient object.

        :return: FileSystemClient
        """
        conn = self.get_connection(self.conn_id)
        file_system_client = FileSystemClient(
            account_url=f"https://{conn.login}.dfs.core.windows.net",
            file_system_name=self.container,
            credential=conn.password,
        )
        return file_system_client
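A usage sketch for the hook above, assuming `hook` is an instance of the (unshown) class that owns `get_conn`, configured with a connection id and container name:
# `hook` is assumed to be an instance of the hook class wrapping get_conn() above.
fs_client = hook.get_conn()
for path in fs_client.get_paths(recursive=False):
    print(path.name)
# Close the underlying session when done.
fs_client.close()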
Example no. 4
    def __init__(self):
        """
        Constructor for the Azure Data Lake Gen2 class, which instantiates a connection to an
        Azure storage account with Data Lake Gen2 mounted as the container.

        The storage account name and key are read from the 'databricks-secret-scope' secret
        scope via the Databricks `dbutils` utilities library, which is assumed to be available
        in the surrounding notebook session.
        """
        self.account_name = dbutils.secrets.get(scope='databricks-secret-scope', key='datalake-account')
        self.account_key = dbutils.secrets.get(scope='databricks-secret-scope', key='datalake-key')
        self.account_url = "https://{0}.dfs.core.windows.net/".format(self.account_name)
        self.service_client = DataLakeServiceClient(account_url=self.account_url, credential=self.account_key)
        self.file_system_name = 'datalake'
        self.file_system = FileSystemClient(account_url=self.account_url, file_system_name=self.file_system_name, credential=self.account_key)
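With both clients wired up, a small helper on the same class could read a file out of the 'datalake' filesystem; the method below is purely illustrative (its name and existence are assumptions):
    def read_file(self, file_path: str) -> bytes:
        # Illustrative helper: download the full contents of a file from the
        # 'datalake' filesystem using the FileSystemClient created above.
        file_client = self.file_system.get_file_client(file_path)
        return file_client.download_file().readall()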
    def save_state(self):
        """
        Updates the transformation state file after a successful run. It's important that this
        method gets called after the pipeline has run, or else the data source will keep
        processing already processed files.
        """
        local_client_auth = self.client_auth.get_local_copy()

        with FileSystemClient(self.account_url,
                              self.filesystem_name,
                              credential=local_client_auth.get_credential_sync(
                              )) as filesystem_client:
            state = self.__retrieve_transformation_state(filesystem_client)

            # state file doesn't exist. We create a fresh one.
            if not state:
                state = {}

            if len(self.file_paths) > 0:
                for path in self.file_paths:
                    # Set 'processed' tag in the metadata of the file
                    metadata = {
                        'processed':
                        datetime.utcnow().strftime(self.DATE_FORMAT)
                    }
                    filesystem_client.get_file_client(
                        path.name).set_metadata(metadata)

                # Get the date from the folder structure of the last file it has processed
                date_elements = self.file_paths[-1].name.split('/')[1:-1]
                date_str = ''.join([x.split('=')[1] for x in date_elements])

                latest_folder_date = datetime.strptime(
                    date_str, '%Y%m%d%H').strftime(self.DATE_FORMAT)
                state['last_successful_run'] = latest_folder_date
                state[
                    'last_backfill_start'] = self.last_backfill_start.strftime(
                        self.DATE_FORMAT)
                self.__save_transformation_state(filesystem_client, state)

        self.file_paths = []
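The date parsing in `save_state` assumes a Hive-style `year=/month=/day=/hour=` folder layout; a small worked example with a hypothetical path:
from datetime import datetime

# Hypothetical file path following the partition layout save_state() expects.
name = 'dataset/year=2021/month=03/day=07/hour=14/part-0001.parquet'
date_elements = name.split('/')[1:-1]                       # ['year=2021', 'month=03', 'day=07', 'hour=14']
date_str = ''.join(x.split('=')[1] for x in date_elements)  # '2021030714'
print(datetime.strptime(date_str, '%Y%m%d%H'))              # 2021-03-07 14:00:00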
def __get_filesystem_client(token: str) -> FileSystemClient:
    account_url = config['Azure Storage']['account_url']
    filesystem_name = config['Azure Storage']['filesystem_name']
    credential = AzureCredential(token)

    return FileSystemClient(account_url, filesystem_name, credential=credential)
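`AzureCredential` here is not a class from the Azure SDK; a minimal sketch of such a wrapper, assuming it adapts an already-acquired bearer token to the azure-core TokenCredential protocol (the one-hour expiry is an arbitrary assumption):
import time

from azure.core.credentials import AccessToken


class AzureCredential:
    """Illustrative adapter exposing a pre-acquired bearer token as a TokenCredential."""

    def __init__(self, token: str):
        self._token = token

    def get_token(self, *scopes, **kwargs) -> AccessToken:
        # The token is assumed to be refreshed by the caller; report a one-hour expiry.
        return AccessToken(self._token, int(time.time()) + 3600)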
Example no. 7
    def _get_container_client(self, storage_account_url: str, file_system: str,
                              credential: Union[DefaultAzureCredential, str]):
        file_system_client = FileSystemClient(
            account_url=storage_account_url,
            file_system_name=file_system,
            credential=credential,
        )
        return file_system_client
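For reference, the same construction can be exercised directly; `DefaultAzureCredential` comes from the azure-identity package, and the account URL and filesystem name below are placeholders:
from azure.identity import DefaultAzureCredential
from azure.storage.filedatalake import FileSystemClient

# Placeholder values; a raw account-key string is also accepted as the credential,
# matching the Union[DefaultAzureCredential, str] hint above.
file_system_client = FileSystemClient(
    account_url="https://<storage-account>.dfs.core.windows.net",
    file_system_name="raw",
    credential=DefaultAzureCredential(),
)
for path in file_system_client.get_paths():
    print(path.name)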