# Assumed imports: ``base`` is intake's DataSource module.
from azure.datalake.store import AzureDLFileSystem, lib
from intake.source import base


class WritableTextFilesADLSource(base.DataSource):
    """
    An Azure Data Lake source that also implements a ``write`` method to
    upload files to the data lake.
    """

    name = 'writableadltext'
    partition_access = True
    version = '0.0.1dev'

    def __init__(self,
                 tenant_id,
                 client_id,
                 client_secret,
                 store_name,
                 metadata=None):
        token = lib.auth(tenant_id=tenant_id,
                         client_id=client_id,
                         client_secret=client_secret)
        self.adl = AzureDLFileSystem(store_name=store_name, token=token)
        super(WritableTextFilesADLSource, self).__init__(metadata=metadata)

    def write(self, local_path, remote_path):
        """Upload a local file to the data lake."""
        self.adl.put(local_path, remote_path)

    def read(self, path):
        """Return the contents of a remote file as bytes."""
        with self.adl.open(path, 'rb') as f:
            return f.read()
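A minimal usage sketch for this source; the tenant, client, and store values are placeholders:

source = WritableTextFilesADLSource(tenant_id='<tenant>',
                                    client_id='<client-id>',
                                    client_secret='<secret>',
                                    store_name='mystore')
source.write('local.txt', '/remote/local.txt')   # upload
data = source.read('/remote/local.txt')          # download as bytes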
Example #3
def second_azure():
    from azure.datalake.store import AzureDLFileSystem
    fs = AzureDLFileSystem(token=settings.TOKEN, store_name=settings.STORE_NAME)

    # Clear filesystem cache to ensure we capture all requests from a test
    fs.invalidate_cache()

    yield fs
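The generator body suggests this is a pytest fixture (the decorator is not shown in the source). A hypothetical test using it:

def test_root_exists(second_azure):
    # pytest injects the AzureDLFileSystem yielded by the fixture
    assert second_azure.exists('/')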
Example #5
def glob(self, path):
    """For a template path, return matching files"""
    adl_path = self._trim_filename(path)
    return [
        'adl://%s.azuredatalakestore.net/%s' % (self.store_name, s)
        for s in AzureDLFileSystem.glob(self, adl_path)
    ]
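This method belongs to a class that subclasses AzureDLFileSystem. A hypothetical call against a store named 'mystore':

matches = fs.glob('adl://mystore.azuredatalakestore.net/folder/*.csv')
# -> ['adl://mystore.azuredatalakestore.net/folder/part-0.csv', ...]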
Example #7
# Assumed imports; AzureDatalakeFile is defined elsewhere in the same module.
import logging

from azure.datalake.store import AzureDLFileSystem, lib
from fsspec import AbstractFileSystem
from fsspec.utils import infer_storage_options, tokenize

logger = logging.getLogger(__name__)


class AzureDatalakeFileSystem(AbstractFileSystem):
    """
    Access Azure Datalake Gen1 as if it were a file system.

    This exposes a filesystem-like API on top of Azure Datalake Storage.

    Parameters
    ----------
    tenant_id: string
        The Azure Active Directory tenant (directory) id
    client_id: string
        The username or service principal id
    client_secret: string
        The access key
    store_name: string (optional)
        The name of the datalake account being accessed.  Will be inferred from
        the urlpath when used with Dask read_xxx and to_xxx methods.

    Examples
    --------
    >>> adl = AzureDatalakeFileSystem(tenant_id="xxxx", client_id="xxxx",
    ...                               client_secret="xxxx")

    >>> adl.ls('')

    Sharded Parquet & CSV files can be read as

    >>> storage_options = dict(tenant_id=TENANT_ID, client_id=CLIENT_ID,
    ...                        client_secret=CLIENT_SECRET)  # doctest: +SKIP
    >>> ddf = dd.read_parquet('adl://store_name/folder/filename.parquet',
    ...                       storage_options=storage_options)  # doctest: +SKIP

    >>> ddf = dd.read_csv('adl://store_name/folder/*.csv',
    ...                   storage_options=storage_options)  # doctest: +SKIP


    Sharded Parquet and CSV files can be written as

    >>> ddf.to_parquet("adl://store_name/folder/filename.parquet",
    ...                storage_options=storage_options)  # doctest: +SKIP

    >>> ddf.to_csv('adl://store_name/folder/*.csv',
    ...            storage_options=storage_options)  # doctest: +SKIP
    """

    protocol = "adl"

    def __init__(self, tenant_id, client_id, client_secret, store_name):
        super().__init__()
        self.tenant_id = tenant_id
        self.client_id = client_id
        self.client_secret = client_secret
        self.store_name = store_name
        self.do_connect()

    @staticmethod
    def _get_kwargs_from_urls(paths):
        """ Get the store_name from the urlpath and pass to storage_options """
        ops = infer_storage_options(paths)
        out = {}
        if ops.get("host", None):
            out["store_name"] = ops["host"]
        return out

    @classmethod
    def _strip_protocol(cls, path):
        ops = infer_storage_options(path)
        return ops["path"]

    def do_connect(self):
        """Establish connection object."""
        token = lib.auth(
            tenant_id=self.tenant_id,
            client_id=self.client_id,
            client_secret=self.client_secret,
        )
        self.azure_fs = AzureDLFileSystem(token=token, store_name=self.store_name)

    def ls(self, path, detail=False, invalidate_cache=True, **kwargs):
        files = self.azure_fs.ls(
            path=path, detail=detail, invalidate_cache=invalidate_cache
        )

        for file in files:
            # With detail=False the entries are plain strings, so guard first.
            if isinstance(file, dict) and file.get("type") == "DIRECTORY":
                file["type"] = "directory"

        return files

    def info(self, path, invalidate_cache=True, expected_error_code=404, **kwargs):
        info = self.azure_fs.info(
            path=path,
            invalidate_cache=invalidate_cache,
            expected_error_code=expected_error_code,
        )
        info["size"] = info["length"]
        return info

    def _trim_filename(self, fn, **kwargs):
        """ Determine what kind of filestore this is and return the path """
        so = infer_storage_options(fn)
        fileparts = so["path"]
        return fileparts

    def glob(self, path, details=False, invalidate_cache=True, **kwargs):
        """For a template path, return matching files"""
        adlpaths = self._trim_filename(path)
        filepaths = self.azure_fs.glob(
            adlpaths, details=details, invalidate_cache=invalidate_cache
        )
        return filepaths

    def isdir(self, path, **kwargs):
        """Is this entry directory-like?"""
        try:
            return self.info(path)["type"].lower() == "directory"
        except FileNotFoundError:
            return False

    def isfile(self, path, **kwargs):
        """Is this entry file-like?"""
        try:
            return self.azure_fs.info(path)["type"].lower() == "file"
        except Exception:
            return False

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        return AzureDatalakeFile(self, path, mode=mode)

    def read_block(self, fn, offset, length, delimiter=None, **kwargs):
        return self.azure_fs.read_block(fn, offset, length, delimiter)

    def ukey(self, path):
        return tokenize(self.info(path)["modificationTime"])

    def size(self, path):
        return self.info(path)["length"]

    def __getstate__(self):
        dic = self.__dict__.copy()
        logger.debug("Serialize with state: %s", dic)
        return dic

    def __setstate__(self, state):
        logger.debug("De-serialize with state: %s", state)
        self.__dict__.update(state)
        self.do_connect()
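Because __setstate__ calls do_connect(), instances survive pickling, which is what allows them to be shipped to distributed workers. A minimal sketch with placeholder credentials:

import pickle

fs = AzureDatalakeFileSystem(tenant_id='<tenant>', client_id='<client>',
                             client_secret='<secret>', store_name='mystore')
fs2 = pickle.loads(pickle.dumps(fs))  # re-authenticates via do_connect()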
Example #8
def open(self, path, mode='rb'):
    """Open a file after trimming the protocol and host from the path."""
    adl_path = self._trim_filename(path)
    f = AzureDLFileSystem.open(self, adl_path, mode=mode)
    return f
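Another method from a class subclassing AzureDLFileSystem. A hypothetical call, with the store name a placeholder:

with fs.open('adl://mystore.azuredatalakestore.net/data/file.txt', mode='rb') as f:
    contents = f.read()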
Example #9
def do_connect(self):
    """Authenticate and initialise the underlying AzureDLFileSystem."""
    token = lib.auth(tenant_id=self.tenant_id,
                     client_id=self.client_id,
                     client_secret=self.client_secret)
    self.kwargs['token'] = token
    AzureDLFileSystem.__init__(self, **self.kwargs)
Example #10
# Assumed imports; FileSystemBase comes from the drfs package (module path assumed).
from azure.datalake.store import AzureDLFileSystem, lib
from drfs.filesystems.base import FileSystemBase


class AzureDataLakeFileSystem(FileSystemBase):
    fs_cls = AzureDLFileSystem
    scheme = "adl"
    is_remote = True
    supports_scheme = False

    def __init__(self,
                 tenant_id=None,
                 client_id=None,
                 client_secret=None,
                 **kwargs):
        self.tenant_id = tenant_id
        self.client_id = client_id
        self.client_secret = client_secret
        self.kwargs = kwargs
        # self.kwargs['store_name'] = kwargs['host']
        token = lib.auth(
            tenant_id=self.tenant_id,
            client_id=self.client_id,
            client_secret=self.client_secret,
        )
        self.kwargs["token"] = token
        self.fs = AzureDLFileSystem(**self.kwargs)

    def _parse_store_name(self, path):
        from drfs.path import RemotePath

        if not isinstance(path, RemotePath):
            path = RemotePath(path)

        store_name, path = path.hostname, path.path
        if store_name == "":
            raise ValueError(
                "Can't connect without store name. Please provide the path in the "
                "following form: 'adl://STORE_NAME/folder/file.extension'!")
        return store_name, path

    def _connect(self, path):
        self.fs.kwargs["store_name"], path = self._parse_store_name(path)
        self.fs.connect()
        return path

    def _add_store_name(self, p):
        from drfs.path import RemotePath

        parts = p.parts
        part0 = parts[0].split("/")[2]
        drv = parts[0].replace(part0, self.fs.kwargs["store_name"])
        return RemotePath(drv, part0, *parts[1:])

    def ls(self, path, *args, **kwargs):
        path = self._connect(path)
        return [
            self._add_store_name(p) for p in super().ls(path, *args, **kwargs)
        ]

    def open(self, path, *args, **kwargs):
        path = self._connect(path)
        return super().open(path, *args, **kwargs)

    def exists(self, path, *args, **kwargs):
        path = self._connect(path)
        return super().exists(path, *args, **kwargs)

    def remove(self, path, *args, **kwargs):
        path = self._connect(path)
        return super().remove(path, *args, **kwargs)

    def mv(self, path, *args, **kwargs):
        path = self._connect(path)
        return super().mv(path, *args, **kwargs)

    def makedirs(self, path, *args, **kwargs):
        path = self._connect(path)
        return super().makedirs(path, *args, **kwargs)

    def rmdir(self, path, *args, **kwargs):
        path = self._connect(path)
        return super().rmdir(path, *args, **kwargs)

    def info(self, path, *args, **kwargs):
        path = self._connect(path)
        return super().info(path, *args, **kwargs)

    def walk(self, *args, **kwargs):
        arg0 = self._connect(args[0])
        return [
            self._add_store_name(p)
            for p in super().walk(arg0, *args[1:], **kwargs)
        ]

    def glob(self, *args, **kwargs):
        arg0 = self._connect(args[0])
        return [
            self._add_store_name(p)
            for p in super().glob(arg0, *args[1:], **kwargs)
        ]
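A minimal usage sketch (placeholder credentials; the store name travels in the adl:// path and is re-parsed by _connect on every call):

fs = AzureDataLakeFileSystem(tenant_id='<tenant>', client_id='<client>',
                             client_secret='<secret>')
files = fs.ls('adl://mystore/some/folder')  # 'mystore' is hypothetical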
Example #11
def setup_env(request):
    # Presumably a pytest fixture; working_dir() and settings come from the
    # surrounding test suite. Creates the test home directory when recording.
    home = working_dir()
    fs = AzureDLFileSystem(store_name=settings.STORE_NAME, token=settings.TOKEN)
    if settings.RECORD_MODE != 'none':
        if not fs.exists(home):
            fs.mkdir(home)
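Assuming setup_env is registered as a pytest fixture, a dependent test can request it so the home directory is guaranteed to exist (hypothetical test):

def test_home_dir_exists(setup_env):
    fs = AzureDLFileSystem(store_name=settings.STORE_NAME, token=settings.TOKEN)
    assert fs.exists(working_dir())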