Example No. 1
def test_dask_parquet(storage):
    fs = adlfs.AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.mkdir("test")
    STORAGE_OPTIONS = {
        "account_name": "devstoreaccount1",
        "connection_string": CONN_STR,
    }
    df = pd.DataFrame(
        {
            "col1": [1, 2, 3, 4],
            "col2": [2, 4, 6, 8],
            "index_key": [1, 1, 2, 2],
            "partition_key": [1, 1, 2, 2],
        }
    )

    dask_dataframe = dd.from_pandas(df, npartitions=1)
    dask_dataframe.to_parquet(
        "abfs://test/test_group", storage_options=STORAGE_OPTIONS, engine="pyarrow"
    )
    fs = adlfs.AzureBlobFileSystem(**STORAGE_OPTIONS)
    assert fs.ls("test") == [
        "test/test_group/_common_metadata",
        "test/test_group/_metadata",
        "test/test_group/part.0.parquet",
    ]
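
A minimal read-back sketch, assuming the STORAGE_OPTIONS dict from the test above is available and the emulator account it points at is still reachable:

import dask.dataframe as dd

# Read the dataset written above back through the abfs:// protocol.
# STORAGE_OPTIONS is assumed to be the same dict used in test_dask_parquet.
ddf = dd.read_parquet(
    "abfs://test/test_group",
    storage_options=STORAGE_OPTIONS,
    engine="pyarrow",
)
print(ddf.compute())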
Example No. 2
def test_ls(storage):
    fs = adlfs.AzureBlobFileSystem(
        storage.account_name,
        "data",
        storage.account_key,
        custom_domain=f"http://{storage.primary_endpoint}",
    )
    assert fs.ls("") == ["root/a/", "root/b/", "root/c/"]
    assert fs.ls("root/") == ["root/a/", "root/b/", "root/c/"]
    assert fs.ls("root") == ["root/a/", "root/b/", "root/c/"]
    assert fs.ls("root/a/") == ["root/a/file.txt"]
    assert fs.ls("root/a") == ["root/a/file.txt"]

    assert fs.ls("root/a/file.txt", detail=True) == [{
        "name": "root/a/file.txt",
        "size": 10,
        "container_name": "data",
        "type": "file",
    }]
    assert fs.ls("root/a", detail=True) == [{
        "name": "root/a/file.txt",
        "size": 10,
        "container_name": "data",
        "type": "file",
    }]
    assert fs.ls("root/a/", detail=True) == [{
        "name": "root/a/file.txt",
        "size": 10,
        "container_name": "data",
        "type": "file",
    }]
Example No. 3
def test_mkdir_rm_recursive(storage):
    fs = adlfs.AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    fs.mkdir("test_mkdir_rm_recursive")
    assert "test_mkdir_rm_recursive/" in fs.ls("")

    with fs.open("test_mkdir_rm_recursive/file.txt", "wb") as f:
        f.write(b"0123456789")

    with fs.open("test_mkdir_rm_recursive/dir/file.txt", "wb") as f:
        f.write(b"ABCD")

    with fs.open("test_mkdir_rm_recursive/dir/file2.txt", "wb") as f:
        f.write(b"abcdef")

    assert fs.find("test_mkdir_rm_recursive") == [
        "test_mkdir_rm_recursive/dir/file.txt",
        "test_mkdir_rm_recursive/dir/file2.txt",
        "test_mkdir_rm_recursive/file.txt",
    ]

    fs.rm("test_mkdir_rm_recursive", recursive=True)

    assert "test_mkdir_rm_recursive/" not in fs.ls("")
    assert fs.find("test_mkdir_rm_recursive") == []
Example No. 4
def test_connect(storage):
    adlfs.AzureBlobFileSystem(
        storage.account_name,
        "data",
        storage.account_key,
        custom_domain=f"http://{storage.primary_endpoint}",
    )
Example No. 5
def test_mkdir_rmdir(storage):
    fs = adlfs.AzureBlobFileSystem(
        storage.account_name,
        storage.account_key,
        custom_domain=f"http://{storage.primary_endpoint}",
    )

    fs.mkdir("new-container")
    assert "new-container/" in fs.ls("")

    with fs.open("new-container/file.txt", "wb") as f:
        f.write(b"0123456789")

    with fs.open("new-container/dir/file.txt", "wb") as f:
        f.write(b"0123456789")

    with fs.open("new-container/dir/file.txt", "wb") as f:
        f.write(b"0123456789")

    fs.rm("new-container/dir", recursive=True)
    assert fs.ls("new-container") == ["new-container/file.txt"]

    fs.rm("new-container/file.txt")
    fs.rmdir("new-container")

    assert "new-container/" not in fs.ls("")
Example No. 6
def test_open_file(storage):
    fs = adlfs.AzureBlobFileSystem(account_name=storage.account_name,
                                   connection_string=CONN_STR)
    f = fs.open("/data/root/a/file.txt")

    result = f.read()
    assert result == b"0123456789"
Example No. 7
def test_rm(storage):
    fs = adlfs.AzureBlobFileSystem(account_name=storage.account_name,
                                   connection_string=CONN_STR)

    fs.rm("/data/root/a/file.txt")

    with pytest.raises(FileNotFoundError):
        fs.ls("/data/root/a/file.txt")
Example No. 8
def test_find(storage):
    fs = adlfs.AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    ## just the directory name
    assert fs.find("data/root/a") == ["data/root/a/file.txt"]  # NOQA
    assert fs.find("data/root/a/") == ["data/root/a/file.txt"]  # NOQA

    assert fs.find("data/root/c") == [
        "data/root/c/file1.txt",
        "data/root/c/file2.txt",
    ]
    assert fs.find("data/root/c/") == [
        "data/root/c/file1.txt",
        "data/root/c/file2.txt",
    ]

    ## all files
    assert fs.find("data/root") == [
        "data/root/a/file.txt",
        "data/root/b/file.txt",
        "data/root/c/file1.txt",
        "data/root/c/file2.txt",
        "data/root/rfile.txt",
    ]
    assert fs.find("data/root", withdirs=False) == [
        "data/root/a/file.txt",
        "data/root/b/file.txt",
        "data/root/c/file1.txt",
        "data/root/c/file2.txt",
        "data/root/rfile.txt",
    ]

    # all files and directories
    assert fs.find("data/root", withdirs=True) == [
        "data/root/a",
        "data/root/a/file.txt",
        "data/root/b",
        "data/root/b/file.txt",
        "data/root/c",
        "data/root/c/file1.txt",
        "data/root/c/file2.txt",
        "data/root/rfile.txt",
    ]
    assert fs.find("data/root/", withdirs=True) == [
        "data/root/a",
        "data/root/a/file.txt",
        "data/root/b",
        "data/root/b/file.txt",
        "data/root/c",
        "data/root/c/file1.txt",
        "data/root/c/file2.txt",
        "data/root/rfile.txt",
    ]

    ## missing
    assert fs.find("data/missing") == []
Example No. 9
def test_ls(storage):
    fs = adlfs.AzureBlobFileSystem(
        storage.account_name,
        "data",
        storage.account_key,
        custom_domain=f"http://{storage.primary_endpoint}",
    )
    assert fs.ls("/") == ["root"]
    assert fs.ls("/root/a/") == ["file.txt"]
Example No. 10
def test_glob(storage):
    fs = adlfs.AzureBlobFileSystem(
        storage.account_name,
        storage.account_key,
        custom_domain=f"http://{storage.primary_endpoint}",
    )

    ## just the directory name
    assert fs.glob("data/root") == ["data/root"]
    ## top-level contents of a directory
    assert fs.glob("data/root/") == [
        "data/root/a",
        "data/root/b",
        "data/root/c",
        "data/root/rfile.txt",
    ]
    assert fs.glob("data/root/*") == [
        "data/root/a",
        "data/root/b",
        "data/root/c",
        "data/root/rfile.txt",
    ]

    assert fs.glob("data/root/b/*") == ["data/root/b/file.txt"]

    ## across directories
    assert fs.glob("data/root/*/file.txt") == [
        "data/root/a/file.txt",
        "data/root/b/file.txt",
    ]

    ## regex match
    assert fs.glob("data/root/*/file[0-9].txt") == [
        "data/root/c/file1.txt",
        "data/root/c/file2.txt",
    ]

    ## text files
    assert fs.glob("data/root/*/file*.txt") == [
        "data/root/a/file.txt",
        "data/root/b/file.txt",
        "data/root/c/file1.txt",
        "data/root/c/file2.txt",
    ]

    ## all text files
    assert fs.glob("data/**/*.txt") == [
        "data/root/a/file.txt",
        "data/root/b/file.txt",
        "data/root/c/file1.txt",
        "data/root/c/file2.txt",
        "data/root/rfile.txt",
    ]

    ## missing
    assert fs.glob("data/missing/*") == []
Example No. 11
def test_open_file(storage):
    fs = adlfs.AzureBlobFileSystem(
        storage.account_name,
        storage.account_key,
        custom_domain=f"http://{storage.primary_endpoint}",
    )
    f = fs.open("/data/root/a/file.txt")

    result = f.read()
    assert result == b"0123456789"
Example No. 12
def test_info(storage):
    fs = adlfs.AzureBlobFileSystem(
        storage.account_name,
        "data",
        storage.account_key,
        custom_domain=f"http://{storage.primary_endpoint}",
    )
    assert fs.info("root/a/file.txt")["name"] == "root/a/file.txt"
    assert fs.info("root/a/file.txt")["container_name"] == "data"
    assert fs.info("root/a/file.txt")["type"] == "file"
    assert fs.info("root/a/file.txt")["size"] == 10
Example No. 13
def test_rm(storage):
    fs = adlfs.AzureBlobFileSystem(
        storage.account_name,
        storage.account_key,
        custom_domain=f"http://{storage.primary_endpoint}",
    )

    fs.rm("/data/root/a/file.txt")

    with pytest.raises(FileNotFoundError):
        fs.ls("/data/root/a/file.txt")
Example No. 14
def test_glob(storage):
    fs = adlfs.AzureBlobFileSystem(account_name=storage.account_name,
                                   connection_string=CONN_STR)

    ## just the directory name
    assert fs.glob("data/root") == ["data/root"]
    ## top-level contents of a directory
    assert fs.glob("data/root/") == [
        "data/root/a",
        "data/root/b",
        "data/root/c",
        "data/root/rfile.txt",
    ]
    assert fs.glob("data/root/*") == [
        "data/root/a",
        "data/root/b",
        "data/root/c",
        "data/root/rfile.txt",
    ]

    assert fs.glob("data/root/b/*") == ["data/root/b/file.txt"]  # NOQA

    ## across directories
    assert fs.glob("data/root/*/file.txt") == [
        "data/root/a/file.txt",
        "data/root/b/file.txt",
    ]

    ## regex match
    assert fs.glob("data/root/*/file[0-9].txt") == [
        "data/root/c/file1.txt",
        "data/root/c/file2.txt",
    ]

    ## text files
    assert fs.glob("data/root/*/file*.txt") == [
        "data/root/a/file.txt",
        "data/root/b/file.txt",
        "data/root/c/file1.txt",
        "data/root/c/file2.txt",
    ]

    ## all text files
    assert fs.glob("data/**/*.txt") == [
        "data/root/a/file.txt",
        "data/root/b/file.txt",
        "data/root/c/file1.txt",
        "data/root/c/file2.txt",
        "data/root/rfile.txt",
    ]

    ## missing
    assert fs.glob("data/missing/*") == []
Example No. 15
def test_ls(storage):
    fs = adlfs.AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    ## these are containers
    assert fs.ls("") == ["data/"]
    assert fs.ls("/") == ["data/"]

    ## these are top-level directories and files
    assert fs.ls("data") == ["data/root/", "data/top_file.txt"]
    assert fs.ls("/data") == ["data/root/", "data/top_file.txt"]

    # root contains files and directories
    assert fs.ls("data/root") == [
        "data/root/a/",
        "data/root/b/",
        "data/root/c/",
        "data/root/rfile.txt",
    ]
    assert fs.ls("data/root/") == [
        "data/root/a/",
        "data/root/b/",
        "data/root/c/",
        "data/root/rfile.txt",
    ]

    ## slashes are not needed, but accepted
    assert fs.ls("data/root/a") == ["data/root/a/file.txt"]
    assert fs.ls("data/root/a/") == ["data/root/a/file.txt"]
    assert fs.ls("/data/root/a") == ["data/root/a/file.txt"]
    assert fs.ls("/data/root/a/") == ["data/root/a/file.txt"]

    ## file details
    assert fs.ls("data/root/a/file.txt", detail=True) == [
        {"name": "data/root/a/file.txt", "size": 10, "type": "file"}
    ]

    ## c has two files
    assert fs.ls("data/root/c", detail=True) == [
        {"name": "data/root/c/file1.txt", "size": 10, "type": "file"},
        {"name": "data/root/c/file2.txt", "size": 10, "type": "file"},
    ]

    ## if no direct match is found, an error is raised
    with pytest.raises(FileNotFoundError):
        fs.ls("not-a-container")

    with pytest.raises(FileNotFoundError):
        fs.ls("data/not-a-directory/")

    with pytest.raises(FileNotFoundError):
        fs.ls("data/root/not-a-file.txt")
Example No. 16
def test_glob(storage):
    fs = adlfs.AzureBlobFileSystem(
        storage.account_name,
        "data",
        storage.account_key,
        custom_domain=f"http://{storage.primary_endpoint}",
    )
    assert fs.glob("root/a/file.txt") == ["root/a/file.txt"]
    assert fs.glob("root/a/") == ["root/a/file.txt"]
    assert fs.glob("root/a") == ["root/a"]
    assert fs.glob("root/") == ["root/a", "root/b", "root/c"]
    assert fs.glob("root/*") == ["root/a", "root/b", "root/c"]
    assert fs.glob("root/c/*.txt") == ["root/c/file1.txt", "root/c/file2.txt"]
Example No. 17
def test_mkdir_rmdir(storage):
    fs = adlfs.AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    fs.mkdir("new-container")
    assert "new-container/" in fs.ls("")
    assert fs.ls("new-container") == []

    with fs.open("new-container/file.txt", "wb") as f:
        f.write(b"0123456789")

    with fs.open("new-container/dir/file.txt", "wb") as f:
        f.write(b"0123456789")

    with fs.open("new-container/dir/file.txt", "wb") as f:
        f.write(b"0123456789")

    # Verify that mkdir succeeds (as a no-op) when the container already
    # exists, and that a file can still be created inside that directory
    fs.mkdir("new-container/dir/file.txt", exists_ok=True)
    assert "new-container/" in fs.ls("")

    fs.mkdir("new-container/file2.txt", exists_ok=True)
    with fs.open("new-container/file2.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/file2.txt" in fs.ls("new-container")

    fs.mkdir("new-container/dir/file2.txt", exists_ok=True)
    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir/file2.txt" in fs.ls("new-container/dir")

    # Also verify you can make a nested directory structure
    fs.mkdir("new-container/dir2/file.txt", exists_ok=True)
    with fs.open("new-container/dir2/file.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir2/file.txt" in fs.ls("new-container/dir2")
    fs.rm("new-container/dir2", recursive=True)

    fs.rm("new-container/dir", recursive=True)
    assert fs.ls("new-container") == [
        "new-container/file.txt",
        "new-container/file2.txt",
    ]

    fs.rm("new-container/file.txt")
    fs.rm("new-container/file2.txt")
    fs.rmdir("new-container")

    assert "new-container/" not in fs.ls("")
Example No. 18
def test_rm_recursive(storage):
    fs = adlfs.AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    assert "data/root/c/" in fs.ls("/data/root")
    assert fs.ls("data/root/c") == [
        "data/root/c/file1.txt",
        "data/root/c/file2.txt",
    ]
    fs.rm("data/root/c", recursive=True)
    assert "data/root/c/" not in fs.ls("/data/root")

    with pytest.raises(FileNotFoundError):
        fs.ls("data/root/c")
Example No. 19
def test_info(storage):
    fs = adlfs.AzureBlobFileSystem(account_name=storage.account_name,
                                   connection_string=CONN_STR)

    container_info = fs.info("data")
    assert container_info == {"name": "data/", "type": "directory", "size": 0}

    dir_info = fs.info("data/root/c")
    assert dir_info == {"name": "data/root/c/", "type": "directory", "size": 0}

    file_info = fs.info("data/root/a/file.txt")
    assert file_info == {
        "name": "data/root/a/file.txt",
        "type": "file",
        "size": 10
    }
Example No. 20
def test_info(storage):
    fs = adlfs.AzureBlobFileSystem(
        storage.account_name,
        storage.account_key,
        custom_domain=f"http://{storage.primary_endpoint}",
    )

    container_info = fs.info("data")
    assert container_info == {"name": "data/", "type": "directory", "size": 0}

    dir_info = fs.info("data/root/c")
    assert dir_info == {"name": "data/root/c/", "type": "directory", "size": 0}
    file_info = fs.info("data/root/a/file.txt")
    assert file_info == {
        "name": "data/root/a/file.txt",
        "type": "file",
        "size": 10
    }
Example No. 21
    def __init__(self, config):
        try:
            import adlfs
        except ModuleNotFoundError:
            raise ModuleNotFoundError("Azure Blob package store requires adlfs module")

        self.storage_account_name = config.get('account_name')
        self.access_key = config.get("account_access_key")
        self.conn_string = config.get("conn_str")

        self.fs = adlfs.AzureBlobFileSystem(
            account_name=self.storage_account_name,
            connection_string=self.conn_string,
            account_key=self.access_key,
        )

        self.container_prefix = config['container_prefix']
        self.container_suffix = config['container_suffix']
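
A hedged usage sketch for this constructor; the wrapper class name AzureBlobPackageStore and all values below are assumptions, only the config keys mirror the ones read in __init__:

# Hypothetical class name and placeholder values; only the config keys
# ("account_name", "account_access_key", "conn_str", "container_prefix",
# "container_suffix") come from the __init__ above.
config = {
    "account_name": "devstoreaccount1",
    "account_access_key": None,           # or a storage account key
    "conn_str": "<connection string>",    # placeholder, not a real connection string
    "container_prefix": "pkg-",
    "container_suffix": "-store",
}
store = AzureBlobPackageStore(config)
print(store.fs.ls(""))  # list containers via the wrapped AzureBlobFileSystem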
Example No. 22
def read_parquet(uri, azure_account_name=None, azure_account_key=None):
    parsed_uri = urlparse(uri)
    if parsed_uri.scheme == "file":
        return pd.read_parquet(parsed_uri.path)
    elif parsed_uri.scheme == "gs":
        fs = gcsfs.GCSFileSystem()
        files = ["gs://" + path for path in fs.glob(uri + "/part-*")]
        ds = parquet.ParquetDataset(files, filesystem=fs)
        return ds.read().to_pandas()
    elif parsed_uri.scheme == "s3" or parsed_uri.scheme == "s3a":

        s3uri = urlunparse(parsed_uri._replace(scheme="s3"))

        import s3fs

        # AWS_S3_ENDPOINT_URL needs to be set when using minio
        if "AWS_S3_ENDPOINT_URL" in os.environ:
            fs = s3fs.S3FileSystem(
                client_kwargs={
                    "endpoint_url": os.getenv("AWS_S3_ENDPOINT_URL")
                })
        else:
            fs = s3fs.S3FileSystem()
        files = ["s3://" + path for path in fs.glob(s3uri + "/part-*")]
        ds = parquet.ParquetDataset(files, filesystem=fs)
        return ds.read().to_pandas()
    elif parsed_uri.scheme == "wasbs":
        import adlfs

        fs = adlfs.AzureBlobFileSystem(account_name=azure_account_name,
                                       account_key=azure_account_key)
        uripath = parsed_uri.username + parsed_uri.path
        files = fs.glob(uripath + "/part-*")
        ds = parquet.ParquetDataset(files, filesystem=fs)
        return ds.read().to_pandas()
    else:
        raise ValueError(f"Unsupported URL scheme {uri}")
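
An illustrative call for the wasbs branch above; the account, key, and path are placeholders. For wasbs URIs the container name is taken from the username component of the URI (the part before the "@"):

# Placeholder values only; in "wasbs://<container>@<account>...", the
# container is parsed out of the username field by urlparse.
df = read_parquet(
    "wasbs://mycontainer@myaccount.blob.core.windows.net/exports/driver_stats",
    azure_account_name="myaccount",
    azure_account_key="<account-key>",
)
print(df.head())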
Example No. 23
def test_deep_paths(storage):
    fs = adlfs.AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    fs.mkdir("test_deep")
    assert "test_deep/" in fs.ls("")

    with fs.open("test_deep/a/b/c/file.txt", "wb") as f:
        f.write(b"0123456789")

    assert fs.ls("test_deep") == ["test_deep/a/"]
    assert fs.ls("test_deep/") == ["test_deep/a/"]
    assert fs.ls("test_deep/a") == ["test_deep/a/b/"]
    assert fs.ls("test_deep/a/") == ["test_deep/a/b/"]
    assert fs.find("test_deep") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/a") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/a/") == ["test_deep/a/b/c/file.txt"]

    fs.rm("test_deep", recursive=True)

    assert "test_deep/" not in fs.ls("")
    assert fs.find("test_deep") == []
Example No. 24
def test_ls0(storage):
    fs = adlfs.AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    assert fs.ls(".") == ["data/"]
Example No. 25
def test_connect(storage):
    adlfs.AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )
Example No. 26
def test_find_missing(storage):
    fs = adlfs.AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    assert fs.find("data/roo") == []
Example No. 27
def test_large_blob(storage):
    import tempfile
    import hashlib
    import io
    import shutil
    from pathlib import Path

    fs = adlfs.AzureBlobFileSystem(
        storage.account_name,
        storage.account_key,
        custom_domain=f"http://{storage.primary_endpoint}",
    )

    # create a 20MB byte array; ensure it's larger than the block sizes to force a
    # chunked upload
    blob_size = 20_000_000
    assert blob_size > fs.blocksize
    assert blob_size > adlfs.AzureBlobFile.DEFAULT_BLOCK_SIZE

    data = b"1" * blob_size
    _hash = hashlib.md5(data)
    expected = _hash.hexdigest()

    # create container
    fs.mkdir("chunk-container")

    # upload the data using fs.open
    path = "chunk-container/large-blob.bin"
    with fs.open(path, "wb") as dst:
        dst.write(data)

    assert fs.exists(path)
    assert fs.size(path) == blob_size

    del data

    # download with fs.open
    bio = io.BytesIO()
    with fs.open(path, "rb") as src:
        shutil.copyfileobj(src, bio)

    # read back the data and calculate md5
    bio.seek(0)
    data = bio.read()
    _hash = hashlib.md5(data)
    result = _hash.hexdigest()

    assert expected == result

    # do the same but using upload/download and a tempdir
    path = "chunk-container/large_blob2.bin"
    with tempfile.TemporaryDirectory() as td:
        local_blob: Path = Path(td) / "large_blob2.bin"
        with local_blob.open("wb") as fo:
            fo.write(data)
        assert local_blob.exists()
        assert local_blob.stat().st_size == blob_size

        fs.upload(str(local_blob), path)
        assert fs.exists(path)
        assert fs.size(path) == blob_size

        # download now
        local_blob.unlink()
        fs.download(path, str(local_blob))
        assert local_blob.exists()
        assert local_blob.stat().st_size == blob_size