def test_put_file(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    lfs = LocalFileSystem()
    fs.mkdir("putdir")

    # Check that put on an empty file works
    with open("sample.txt", "wb") as f:
        f.write(b"")
    fs.put("sample.txt", "putdir/sample.txt")
    fs.get("putdir/sample.txt", "sample2.txt")

    with open("sample.txt", "rb") as f:
        f1 = f.read()
    with open("sample2.txt", "rb") as f:
        f2 = f.read()
    assert f1 == f2
    lfs.rm("sample.txt")
    lfs.rm("sample2.txt")

    # Check that put on a file with data works
    with open("sample3.txt", "wb") as f:
        f.write(b"01234567890")
    fs.put("sample3.txt", "putdir/sample3.txt")
    fs.get("putdir/sample3.txt", "sample4.txt")

    with open("sample3.txt", "rb") as f:
        f3 = f.read()
    with open("sample4.txt", "rb") as f:
        f4 = f.read()
    assert f3 == f4
    fs.rm("putdir", recursive=True)
def test_mkdir_rm_recursive(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.mkdir("test_mkdir_rm_recursive")
    assert "test_mkdir_rm_recursive/" in fs.ls("")

    with fs.open("test_mkdir_rm_recursive/file.txt", "wb") as f:
        f.write(b"0123456789")
    with fs.open("test_mkdir_rm_recursive/dir/file.txt", "wb") as f:
        f.write(b"ABCD")
    with fs.open("test_mkdir_rm_recursive/dir/file2.txt", "wb") as f:
        f.write(b"abcdef")

    assert fs.find("test_mkdir_rm_recursive") == [
        "test_mkdir_rm_recursive/dir/file.txt",
        "test_mkdir_rm_recursive/dir/file2.txt",
        "test_mkdir_rm_recursive/file.txt",
    ]

    fs.rm("test_mkdir_rm_recursive", recursive=True)

    assert "test_mkdir_rm_recursive/" not in fs.ls("")
    assert fs.find("test_mkdir_rm_recursive") == []
def test_mkdir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    # Verify mkdir will create a new container when create_parents is True
    fs.mkdir("new-container", create_parents=True)
    assert "new-container" in fs.ls(".")
    fs.rm("new-container")

    # Verify a new container will not be created when create_parents
    # is False
    with pytest.raises(PermissionError):
        fs.mkdir("new-container", create_parents=False)

    # Test creating a subdirectory when the container does not exist.
    # With create_parents=True, mkdir creates the top-level container
    # but will NOT create nested directories.
    fs.mkdir("new-container/dir", create_parents=True)
    assert "new-container/dir" not in fs.ls("new-container")
    assert "new-container" in fs.ls(".")
    fs.rm("new-container", recursive=True)

    # Test that creating a directory that already exists passes
    fs.mkdir("data")
    assert "data" in fs.ls(".")

    # Test raising error when container does not exist
    with pytest.raises(PermissionError):
        fs.mkdir("new-container/dir", create_parents=False)
def test_metadata_write(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.mkdir("test_metadata_write")
    data = b"0123456789"
    metadata = {"meta": "data"}

    # standard blob type
    with fs.open("test_metadata_write/file.txt", "wb", metadata=metadata) as f:
        f.write(data)
    info = fs.info("test_metadata_write/file.txt")
    assert info["metadata"] == metadata

    metadata_changed_on_write = {"meta": "datum"}
    with fs.open(
        "test_metadata_write/file.txt", "wb", metadata=metadata_changed_on_write
    ) as f:
        f.write(data)
    info = fs.info("test_metadata_write/file.txt")
    assert info["metadata"] == metadata_changed_on_write

    # append blob type
    new_metadata = {"data": "meta"}
    with fs.open("test_metadata_write/append-file.txt", "ab", metadata=metadata) as f:
        f.write(data)

    # try to change metadata while appending blocks
    with fs.open(
        "test_metadata_write/append-file.txt", "ab", metadata=new_metadata
    ) as f:
        f.write(data)
    info = fs.info("test_metadata_write/append-file.txt")

    # the azure blob client doesn't seem to support metadata mutation when
    # appending blocks; let's be sure this behavior doesn't change, as that
    # would imply a potential breaking change
    assert info["metadata"] == metadata

    # getxattr / setxattrs
    assert fs.getxattr("test_metadata_write/file.txt", "meta") == "datum"
    fs.setxattrs("test_metadata_write/file.txt", metadata="data2")
    assert fs.getxattr("test_metadata_write/file.txt", "metadata") == "data2"
    assert fs.info("test_metadata_write/file.txt")["metadata"] == {"metadata": "data2"}

    # empty file and nested directory
    with fs.open(
        "test_metadata_write/a/b/c/nested-file.txt", "wb", metadata=metadata
    ) as f:
        f.write(b"")
    assert fs.getxattr("test_metadata_write/a/b/c/nested-file.txt", "meta") == "data"
    fs.setxattrs("test_metadata_write/a/b/c/nested-file.txt", metadata="data2")
    assert fs.info("test_metadata_write/a/b/c/nested-file.txt")["metadata"] == {
        "metadata": "data2"
    }

    fs.rmdir("test_metadata_write")
def test_cat(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.mkdir("catdir")
    data = b"0123456789"
    with fs.open("catdir/catfile.txt", "wb") as f:
        f.write(data)
    assert fs.cat("catdir/catfile.txt") == data
    fs.rm("catdir/catfile.txt")
def test_mkdir_rmdir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    fs.mkdir("new-container")
    assert "new-container/" in fs.ls("")
    assert fs.ls("new-container") == []

    with fs.open(path="new-container/file.txt", mode="wb") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file.txt", "wb") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")

    # Check to verify you can skip making a directory if the container
    # already exists, but still create a file in that directory
    fs.mkdir("new-container/dir/file.txt", exist_ok=False)
    assert "new-container/" in fs.ls("")

    fs.mkdir("new-container/file2.txt", exist_ok=False)
    assert "new-container/file2.txt" in fs.ls("new-container")

    # Test to verify that the file contains expected contents
    with fs.open("new-container/file2.txt", "rb") as f:
        outfile = f.read()
    assert outfile == b""

    # Check that calling mkdir on an existing nested file path works as expected
    fs.mkdir("new-container/dir/file2.txt", exist_ok=False)
    assert "new-container/dir/file2.txt" in fs.ls("new-container/dir")

    # Also verify you can make a nested directory structure
    fs.mkdir("new-container/dir2/file.txt", exist_ok=False)
    with fs.open("new-container/dir2/file.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir2/file.txt" in fs.ls("new-container/dir2")
    fs.rm("new-container/dir2", recursive=True)

    fs.rm("new-container/dir", recursive=True)
    assert fs.ls("new-container") == [
        "new-container/file.txt",
        "new-container/file2.txt",
    ]

    fs.rm("new-container/file.txt")
    fs.rm("new-container/file2.txt")
    fs.rmdir("new-container")

    assert "new-container/" not in fs.ls("")
def test_cp_file(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.mkdir("homedir")
    fs.mkdir("homedir/enddir")
    fs.touch("homedir/startdir/test_file.txt")
    fs.cp_file("homedir/startdir/test_file.txt", "homedir/enddir/test_file.txt")
    files = fs.ls("homedir/enddir")
    assert "homedir/enddir/test_file.txt" in files

    fs.rm("homedir", recursive=True)
def test_mkdir_rmdir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    fs.mkdir("new-container")
    assert "new-container/" in fs.ls("")
    assert fs.ls("new-container") == []

    with fs.open(path="new-container/file.txt", mode="wb") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file.txt", "wb") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")

    # Check to verify you can skip making a directory if the container
    # already exists, but still create a file in that directory
    fs.mkdir("new-container/dir/file.txt", exists_ok=True)
    assert "new-container/" in fs.ls("")

    fs.mkdir("new-container/file2.txt", exists_ok=True)
    with fs.open("new-container/file2.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/file2.txt" in fs.ls("new-container")

    fs.mkdir("new-container/dir/file2.txt", exists_ok=True)
    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir/file2.txt" in fs.ls("new-container/dir")

    # Also verify you can make a nested directory structure
    fs.mkdir("new-container/dir2/file.txt", exists_ok=True)
    with fs.open("new-container/dir2/file.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir2/file.txt" in fs.ls("new-container/dir2")
    fs.rm("new-container/dir2", recursive=True)

    fs.rm("new-container/dir", recursive=True)
    assert fs.ls("new-container") == [
        "new-container/file.txt",
        "new-container/file2.txt",
    ]

    fs.rm("new-container/file.txt")
    fs.rm("new-container/file2.txt")
    fs.rmdir("new-container")

    assert "new-container/" not in fs.ls("")
def test_append_operation(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.mkdir("append-container")

    # Check that appending to an existing file works as expected
    with fs.open("append-container/append_file.txt", "ab") as f:
        f.write(b"0123456789")
    with fs.open("append-container/append_file.txt", "ab") as f:
        f.write(b"0123456789")
    with fs.open("append-container/append_file.txt", "rb") as f:
        outfile = f.read()
    assert outfile == b"01234567890123456789"

    fs.rm("append-container", recursive=True)
def test_url(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
        account_key=KEY,
    )
    fs.mkdir("catdir")
    data = b"0123456789"
    with fs.open("catdir/catfile.txt", "wb") as f:
        f.write(data)

    import requests

    r = requests.get(fs.url("catdir/catfile.txt"))
    assert r.status_code == 200
    assert r.content == data

    fs.rm("catdir/catfile.txt")
def fs(self):
    from adlfs import AzureBlobFileSystem
    from azure.core.exceptions import AzureError

    try:
        file_system = AzureBlobFileSystem(**self.login_info)
        if self.bucket not in [
            container.rstrip("/") for container in file_system.ls("/")
        ]:
            file_system.mkdir(self.bucket)
    except (ValueError, AzureError) as e:
        raise AzureAuthError(
            f"Authentication to Azure Blob Storage via {self.login_method}"
            " failed.\nLearn more about configuration settings at"
            f" {format_link('https://man.dvc.org/remote/modify')}"
        ) from e

    return file_system
def test_dask_parquet(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.mkdir("test")
    STORAGE_OPTIONS = {
        "account_name": "devstoreaccount1",
        "connection_string": CONN_STR,
    }
    df = pd.DataFrame(
        {
            "col1": [1, 2, 3, 4],
            "col2": [2, 4, 6, 8],
            "index_key": [1, 1, 2, 2],
            "partition_key": [1, 1, 2, 2],
        }
    )

    dask_dataframe = dd.from_pandas(df, npartitions=1)
    for protocol in ["abfs", "az"]:
        dask_dataframe.to_parquet(
            "{}://test@dfs.core.windows.net/test_group.parquet".format(protocol),
            storage_options=STORAGE_OPTIONS,
            engine="pyarrow",
        )

    fs = AzureBlobFileSystem(**STORAGE_OPTIONS)
    assert fs.ls("test/test_group.parquet") == [
        "test/test_group.parquet/_common_metadata",
        "test/test_group.parquet/_metadata",
        "test/test_group.parquet/part.0.parquet",
    ]
    fs.rm("test/test_group.parquet")

    df_test = dd.read_parquet(
        "abfs://test/test_group.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df, df_test)
def test_deep_paths(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.mkdir("test_deep")
    assert "test_deep/" in fs.ls("")

    with fs.open("test_deep/a/b/c/file.txt", "wb") as f:
        f.write(b"0123456789")

    assert fs.ls("test_deep") == ["test_deep/a/"]
    assert fs.ls("test_deep/") == ["test_deep/a/"]
    assert fs.ls("test_deep/a") == ["test_deep/a/b/"]
    assert fs.ls("test_deep/a/") == ["test_deep/a/b/"]

    assert fs.find("test_deep") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/a") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/a/") == ["test_deep/a/b/c/file.txt"]

    fs.rm("test_deep", recursive=True)

    assert "test_deep/" not in fs.ls("")
    assert fs.find("test_deep") == []
def test_mkdir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    # Verify mkdir will create a new container when create_parents is True
    fs.mkdir("new-container", create_parents=True)
    assert "new-container/" in fs.ls(".")
    fs.rm("new-container")

    # Verify a new container will not be created when create_parents
    # is False
    with pytest.raises(PermissionError):
        fs.mkdir("new-container", create_parents=False)

    # Test creating subdirectory when container does not exist
    fs.mkdir("new-container/dir", create_parents=True)
    assert "new-container/dir" in fs.ls("new-container")
    fs.rm("new-container", recursive=True)

    # Test raising error when container does not exist
    with pytest.raises(PermissionError):
        fs.mkdir("new-container/dir", create_parents=False)
def test_dask_parquet(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.mkdir("test")
    STORAGE_OPTIONS = {
        "account_name": "devstoreaccount1",
        "connection_string": CONN_STR,
    }
    df = pd.DataFrame(
        {
            "col1": [1, 2, 3, 4],
            "col2": [2, 4, 6, 8],
            "index_key": [1, 1, 2, 2],
            "partition_key": [1, 1, 2, 2],
        }
    )

    dask_dataframe = dd.from_pandas(df, npartitions=1)
    for protocol in ["abfs", "az"]:
        dask_dataframe.to_parquet(
            "{}://test/test_group.parquet".format(protocol),
            storage_options=STORAGE_OPTIONS,
            engine="pyarrow",
        )

    fs = AzureBlobFileSystem(**STORAGE_OPTIONS)
    assert fs.ls("test/test_group.parquet") == [
        "test/test_group.parquet/_common_metadata",
        "test/test_group.parquet/_metadata",
        "test/test_group.parquet/part.0.parquet",
    ]
    fs.rm("test/test_group.parquet")

    df_test = dd.read_parquet(
        "abfs://test/test_group.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df, df_test)

    A = np.random.randint(0, 100, size=(10000, 4))
    df2 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf2 = dd.from_pandas(df2, npartitions=4)
    dd.to_parquet(
        ddf2,
        "abfs://test/test_group2.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.ls("test/test_group2.parquet") == [
        "test/test_group2.parquet/_common_metadata",
        "test/test_group2.parquet/_metadata",
        "test/test_group2.parquet/part.0.parquet",
        "test/test_group2.parquet/part.1.parquet",
        "test/test_group2.parquet/part.2.parquet",
        "test/test_group2.parquet/part.3.parquet",
    ]
    df2_test = dd.read_parquet(
        "abfs://test/test_group2.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df2, df2_test)

    a = np.full(shape=(10000, 1), fill_value=1)
    b = np.full(shape=(10000, 1), fill_value=2)
    c = np.full(shape=(10000, 1), fill_value=3)
    d = np.full(shape=(10000, 1), fill_value=4)
    B = np.concatenate((a, b, c, d), axis=1)
    df3 = pd.DataFrame(data=B, columns=list("ABCD"))
    ddf3 = dd.from_pandas(df3, npartitions=4)
    dd.to_parquet(
        ddf3,
        "abfs://test/test_group3.parquet",
        partition_on=["A", "B"],
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.glob("test/test_group3.parquet/*") == [
        "test/test_group3.parquet/A=1",
        "test/test_group3.parquet/_common_metadata",
        "test/test_group3.parquet/_metadata",
    ]
    df3_test = dd.read_parquet(
        "abfs://test/test_group3.parquet",
        filters=[("A", "=", 1)],
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    df3_test = df3_test[["A", "B", "C", "D"]]
    df3_test = df3_test[["A", "B", "C", "D"]].astype(int)
    assert_frame_equal(df3, df3_test)

    A = np.random.randint(0, 100, size=(10000, 4))
    df4 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf4 = dd.from_pandas(df4, npartitions=4)
    dd.to_parquet(
        ddf4,
        "abfs://test/test_group4.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
        flavor="spark",
        write_statistics=False,
    )
    fs.rmdir("test/test_group4.parquet/_common_metadata", recursive=True)
    fs.rmdir("test/test_group4.parquet/_metadata", recursive=True)
    fs.rm("test/test_group4.parquet/_common_metadata")
    fs.rm("test/test_group4.parquet/_metadata")
    assert fs.ls("test/test_group4.parquet") == [
        "test/test_group4.parquet/part.0.parquet",
        "test/test_group4.parquet/part.1.parquet",
        "test/test_group4.parquet/part.2.parquet",
        "test/test_group4.parquet/part.3.parquet",
    ]
    df4_test = dd.read_parquet(
        "abfs://test/test_group4.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df4, df4_test)

    A = np.random.randint(0, 100, size=(10000, 4))
    df5 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf5 = dd.from_pandas(df5, npartitions=4)
    dd.to_parquet(
        ddf5,
        "abfs://test/test group5.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.ls("test/test group5.parquet") == [
        "test/test group5.parquet/_common_metadata",
        "test/test group5.parquet/_metadata",
        "test/test group5.parquet/part.0.parquet",
        "test/test group5.parquet/part.1.parquet",
        "test/test group5.parquet/part.2.parquet",
        "test/test group5.parquet/part.3.parquet",
    ]
    df5_test = dd.read_parquet(
        "abfs://test/test group5.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df5, df5_test)
def test_large_blob(storage):
    import tempfile
    import hashlib
    import io
    import shutil
    from pathlib import Path

    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    # create a ~120 MB byte array, ensuring it's larger than the blocksizes
    # to force a chunked upload
    blob_size = 120_000_000
    # blob_size = 2_684_354_560
    assert blob_size > fs.blocksize
    assert blob_size > AzureBlobFile.DEFAULT_BLOCK_SIZE

    data = b"1" * blob_size
    _hash = hashlib.md5(data)
    expected = _hash.hexdigest()

    # create container
    fs.mkdir("chunk-container")

    # upload the data using fs.open
    path = "chunk-container/large-blob.bin"
    with fs.open(path, "ab") as dst:
        dst.write(data)

    assert fs.exists(path)
    assert fs.size(path) == blob_size

    del data

    # download with fs.open
    bio = io.BytesIO()
    with fs.open(path, "rb") as src:
        shutil.copyfileobj(src, bio)

    # read back the data and calculate md5
    bio.seek(0)
    data = bio.read()
    _hash = hashlib.md5(data)
    result = _hash.hexdigest()
    assert expected == result

    # do the same but using upload/download and a tempdir
    path = "chunk-container/large_blob2.bin"
    with tempfile.TemporaryDirectory() as td:
        local_blob: Path = Path(td) / "large_blob2.bin"
        with local_blob.open("wb") as fo:
            fo.write(data)
        assert local_blob.exists()
        assert local_blob.stat().st_size == blob_size

        fs.upload(str(local_blob), path)
        assert fs.exists(path)
        assert fs.size(path) == blob_size

        # download now
        local_blob.unlink()
        fs.download(path, str(local_blob))
        assert local_blob.exists()
        assert local_blob.stat().st_size == blob_size