import dask.dataframe as dd
import dask_mpi
import mlflow
import xgboost as xgb
from adlfs import AzureBlobFileSystem
from dask.distributed import Client


def main(args):
    # distributed setup
    print("initializing...")
    dask_mpi.initialize(nthreads=args.cpus_per_node)
    client = Client()
    print(client)

    # get data
    print("connecting to data...")
    print(client)
    container_name = "malware"
    storage_options = {"account_name": "azuremlexamples"}
    fs = AzureBlobFileSystem(**storage_options)
    files = fs.ls(f"{container_name}/processed")

    # read into dataframes
    print("creating dataframes...")
    print(client)
    for f in files:
        if "train" in f:
            df_train = dd.read_parquet(f"az://{f}", storage_options=storage_options)
        elif "test" in f:
            df_test = dd.read_parquet(f"az://{f}", storage_options=storage_options)

    # data processing
    print("processing data...")
    print(client)
    cols = [col for col in df_train.columns if df_train.dtypes[col] != "object"]
    X = df_train[cols].drop("HasDetections", axis=1).values.persist()
    y = df_train["HasDetections"].persist()

    # train xgboost
    print("training xgboost...")
    print(client)
    params = {
        "objective": "binary:logistic",
        "learning_rate": args.learning_rate,
        "gamma": args.gamma,
        "max_depth": args.max_depth,
    }
    mlflow.log_params(params)  # log to the run

    dtrain = xgb.dask.DaskDMatrix(client, X, y)
    model = xgb.dask.train(client, params, dtrain, num_boost_round=args.num_boost_round)
    print(model)

    # predict on test data
    print("making predictions...")
    print(client)
    X_test = df_test[
        [col for col in cols if "HasDetections" not in col]
    ].values.persist()
    y_pred = xgb.dask.predict(client, model, X_test)
    y_pred.to_dask_dataframe().to_csv("./outputs/predictions.csv")

    # save model
    print("saving model...")
    print(client)
    mlflow.xgboost.log_model(model["booster"], "./outputs/model")
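# Hedged sketch of a script entry point for main() above: the argument names
# mirror the attributes main() reads (cpus_per_node, learning_rate, gamma,
# max_depth, num_boost_round); the defaults and the argparse wiring are
# illustrative assumptions, not part of the original script.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cpus_per_node", type=int, default=4)
    parser.add_argument("--learning_rate", type=float, default=0.1)
    parser.add_argument("--gamma", type=float, default=0.0)
    parser.add_argument("--max_depth", type=int, default=8)
    parser.add_argument("--num_boost_round", type=int, default=100)
    args = parser.parse_args()
    main(args)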
def test_open_file(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    f = fs.open("/data/root/a/file.txt")
    result = f.read()
    assert result == b"0123456789"
def test_fetch_second_half(storage):
    # Verify that if the requested length extends beyond the end of the file,
    # the read is truncated
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )
    blob = fs.open("data/top_file.txt")
    assert len(blob._fetch_range(start=5, end=10)) == 5
def test_isdir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    BUCKET = "/name/of/the/bucket"
    BASE_PATH = BUCKET + "/" + "012345"
    # EMPTY_DIR = BASE_PATH + "/empty_dir"
    fs.makedirs(BASE_PATH)
    assert fs.isdir(BASE_PATH) is True
def test_open_context_manager(storage, mocker):
    "test closing azure client with context manager"
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    with fs.open("/data/root/a/file.txt") as f:
        close = mocker.patch.object(f.container_client, "close")
        result = f.read()
        assert result == b"0123456789"
    close.assert_called_once()
def test_open_file(storage, mocker):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    f = fs.open("/data/root/a/file.txt")
    result = f.read()
    assert result == b"0123456789"

    close = mocker.patch.object(f.container_client, "close")
    f.close()
    close.assert_called_once()
def test_exists(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    assert fs.exists("data/top_file.txt")
    assert fs.exists("data")
    assert fs.exists("data/")
    assert not fs.exists("non-existent-container")
    assert not fs.exists("non-existent-container/")
    assert fs.exists("")
    assert not fs.exists("data/not-a-key")
def test_cat(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.mkdir("catdir")
    data = b"0123456789"
    with fs.open("catdir/catfile.txt", "wb") as f:
        f.write(data)
    assert fs.cat("catdir/catfile.txt") == data
    fs.rm("catdir/catfile.txt")
def fs(self):
    from adlfs import AzureBlobFileSystem
    from azure.core.exceptions import AzureError

    try:
        file_system = AzureBlobFileSystem(**self.login_info)
        if self.bucket not in [
            container.rstrip("/") for container in file_system.ls("/")
        ]:
            file_system.mkdir(self.bucket)
    except (ValueError, AzureError) as e:
        raise AzureAuthError(
            f"Authentication to Azure Blob Storage via {self.login_method}"
            " failed.\nLearn more about configuration settings at"
            f" {format_link('https://man.dvc.org/remote/modify')}"
        ) from e

    return file_system
def setup(self, stage=None):
    data_dir = "datasets/mnist"
    storage_options = {"account_name": "azuremlexamples"}
    fs = AzureBlobFileSystem(**storage_options)
    files = fs.ls(data_dir)

    train_len = 60000
    test_len = 10000

    for f in files:
        if "train-images" in f:
            self.X_train = self._read_images(gzip.open(fs.open(f)), train_len)
        elif "train-labels" in f:
            self.y_train = self._read_labels(gzip.open(fs.open(f)), train_len)
        elif "images" in f:
            self.X_test = self._read_images(gzip.open(fs.open(f)), test_len)
        elif "labels" in f:
            self.y_test = self._read_labels(gzip.open(fs.open(f)), test_len)

    self.ohe = OneHotEncoder().fit(self.y_train.reshape(-1, 1))

    self.mnist_train = list(
        zip(
            self.X_train,
            self.ohe.transform(self.y_train.reshape(-1, 1)).toarray(),
        )
    )
    self.mnist_test = list(
        zip(
            self.X_test,
            self.ohe.transform(self.y_test.reshape(-1, 1)).toarray(),
        )
    )
def test_rm_recursive(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    assert "data/root/c/" in fs.ls("/data/root")
    assert fs.ls("data/root/c") == [
        "data/root/c/file1.txt",
        "data/root/c/file2.txt",
    ]

    fs.rm("data/root/c", recursive=True)

    assert "data/root/c/" not in fs.ls("/data/root")
    with pytest.raises(FileNotFoundError):
        fs.ls("data/root/c")
def test_append_operation(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.mkdir("append-container")

    # Check that appending to an existing file works as expected
    with fs.open("append-container/append_file.txt", "ab") as f:
        f.write(b"0123456789")
    with fs.open("append-container/append_file.txt", "ab") as f:
        f.write(b"0123456789")
    with fs.open("append-container/append_file.txt", "rb") as f:
        outfile = f.read()
    assert outfile == b"01234567890123456789"

    fs.rm("append-container", recursive=True)
def __init__(
    self,
    path: str = None,
    gen: int = 2,
    credentials: Dict[str, Any] = None,
    *args,
    **kwargs,
):
    credentials = credentials or local_config.get("AZURE_ADLS")
    super().__init__(*args, credentials=credentials, **kwargs)

    storage_account_name = self.credentials["ACCOUNT_NAME"]
    tenant_id = self.credentials["AZURE_TENANT_ID"]
    client_id = self.credentials["AZURE_CLIENT_ID"]
    client_secret = self.credentials["AZURE_CLIENT_SECRET"]

    self.path = path
    self.gen = gen
    self.storage_options = {
        "tenant_id": tenant_id,
        "client_id": client_id,
        "client_secret": client_secret,
    }

    if gen == 1:
        self.fs = AzureDatalakeFileSystem(
            store_name=storage_account_name,
            tenant_id=tenant_id,
            client_id=client_id,
            client_secret=client_secret,
        )
        self.base_url = f"adl://{storage_account_name}"
    elif gen == 2:
        self.storage_options["account_name"] = storage_account_name
        self.fs = AzureBlobFileSystem(
            account_name=storage_account_name,
            tenant_id=tenant_id,
            client_id=client_id,
            client_secret=client_secret,
        )
        self.base_url = "az://"
def fs(self):
    from adlfs import AzureBlobFileSystem
    from azure.core.exceptions import AzureError

    try:
        return AzureBlobFileSystem(**self.fs_args)
    except (ValueError, AzureError) as e:
        raise AzureAuthError(
            f"Authentication to Azure Blob Storage via {self.login_method}"
            " failed.\nLearn more about configuration settings at"
            f" {format_link('https://man.dvc.org/remote/modify')}"
        ) from e
def test_rm(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.rm("/data/root/a/file.txt")

    with pytest.raises(FileNotFoundError):
        fs.ls("/data/root/a/file.txt", refresh=True)
def test_url(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
        account_key=KEY,
    )
    fs.mkdir("catdir")
    data = b"0123456789"
    with fs.open("catdir/catfile.txt", "wb") as f:
        f.write(data)

    import requests

    r = requests.get(fs.url("catdir/catfile.txt"))
    assert r.status_code == 200
    assert r.content == data

    fs.rm("catdir/catfile.txt")
def test_dask_parquet(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.mkdir("test")
    STORAGE_OPTIONS = {
        "account_name": "devstoreaccount1",
        "connection_string": CONN_STR,
    }
    df = pd.DataFrame(
        {
            "col1": [1, 2, 3, 4],
            "col2": [2, 4, 6, 8],
            "index_key": [1, 1, 2, 2],
            "partition_key": [1, 1, 2, 2],
        }
    )

    dask_dataframe = dd.from_pandas(df, npartitions=1)
    for protocol in ["abfs", "az"]:
        dask_dataframe.to_parquet(
            "{}://test/test_group.parquet".format(protocol),
            storage_options=STORAGE_OPTIONS,
            engine="pyarrow",
        )

        fs = AzureBlobFileSystem(**STORAGE_OPTIONS)
        assert fs.ls("test/test_group.parquet") == [
            "test/test_group.parquet/_common_metadata",
            "test/test_group.parquet/_metadata",
            "test/test_group.parquet/part.0.parquet",
        ]
        fs.rm("test/test_group.parquet")

    df_test = dd.read_parquet(
        "abfs://test/test_group.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df, df_test)
def configure_targets(
    bakery: Bakery,
    recipe_bakery: RecipeBakery,
    recipe_name: str,
    secrets: Dict,
    extension: str,
):
    target = bakery.targets[recipe_bakery.target]
    repository = os.environ["GITHUB_REPOSITORY"]
    if target.private.protocol == S3_PROTOCOL:
        if target.private.storage_options:
            key = secrets[target.private.storage_options.key]
            secret = secrets[target.private.storage_options.secret]
            fs = S3FileSystem(
                anon=False,
                default_cache_type="none",
                default_fill_cache=False,
                key=key,
                secret=secret,
            )
            target_path = f"s3://{recipe_bakery.target}/{repository}/{recipe_name}.{extension}"
            target = FSSpecTarget(fs, target_path)
            cache_path = f"s3://{recipe_bakery.target}/{repository}/{recipe_name}/cache"
            cache_target = CacheFSSpecTarget(fs, cache_path)
            metadata_path = f"s3://{recipe_bakery.target}/{repository}/{recipe_name}/cache/metadata"
            metadata_target = MetadataTarget(fs, metadata_path)
            return Targets(target=target, cache=cache_target, metadata=metadata_target)
    elif target.private.protocol == ABFS_PROTOCOL:
        if target.private.storage_options:
            secret = secrets[target.private.storage_options.secret]
            fs = AzureBlobFileSystem(connection_string=secret)
            target_path = f"abfs://{recipe_bakery.target}/{repository}/{recipe_name}.{extension}"
            target = FSSpecTarget(fs, target_path)
            cache_path = f"abfs://{recipe_bakery.target}/{repository}/{recipe_name}/cache"
            cache_target = CacheFSSpecTarget(fs, cache_path)
            metadata_path = (
                f"abfs://{recipe_bakery.target}/{repository}/{recipe_name}"
                "/cache/metadata"
            )
            metadata_target = MetadataTarget(fs, metadata_path)
            return Targets(target=target, cache=cache_target, metadata=metadata_target)
    else:
        raise UnsupportedTarget
def test_info(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    container_info = fs.info("data")
    assert container_info == {"name": "data/", "type": "directory", "size": 0}

    container2_info = fs.info("data/root")
    assert container2_info == {"name": "data/root/", "type": "directory", "size": 0}

    dir_info = fs.info("data/root/c")
    assert dir_info == {"name": "data/root/c/", "type": "directory", "size": 0}

    file_info = fs.info("data/root/a/file.txt")
    assert file_info == {"name": "data/root/a/file.txt", "type": "file", "size": 10}
def test_deep_paths(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    fs.mkdir("test_deep")
    assert "test_deep/" in fs.ls("")

    with fs.open("test_deep/a/b/c/file.txt", "wb") as f:
        f.write(b"0123456789")

    assert fs.ls("test_deep") == ["test_deep/a/"]
    assert fs.ls("test_deep/") == ["test_deep/a/"]
    assert fs.ls("test_deep/a") == ["test_deep/a/b/"]
    assert fs.ls("test_deep/a/") == ["test_deep/a/b/"]
    assert fs.find("test_deep") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/a") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/a/") == ["test_deep/a/b/c/file.txt"]

    fs.rm("test_deep", recursive=True)

    assert "test_deep/" not in fs.ls("")
    assert fs.find("test_deep") == []
def test_fetch_first_half(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )
    blob = fs.open("data/top_file.txt")
    assert len(blob._fetch_range(start=0, end=5)) == 5
def test_fetch_length_is_none(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )
    blob = fs.open("data/top_file.txt")
    assert len(blob._fetch_range(start=2, end=None)) == 8
def test_cp_file(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.mkdir("homedir")
    fs.mkdir("homedir/enddir")
    fs.touch("homedir/startdir/test_file.txt")
    fs.cp_file("homedir/startdir/test_file.txt", "homedir/enddir/test_file.txt")
    files = fs.ls("homedir/enddir")
    assert "homedir/enddir/test_file.txt" in files

    fs.rm("homedir", recursive=True)
def test_put_file(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    lfs = LocalFileSystem()
    fs.mkdir("putdir")

    # Check that put on an empty file works
    with open("sample.txt", "wb") as f:
        f.write(b"")
    fs.put("sample.txt", "putdir/sample.txt")
    fs.get("putdir/sample.txt", "sample2.txt")

    with open("sample.txt", "rb") as f:
        f1 = f.read()
    with open("sample2.txt", "rb") as f:
        f2 = f.read()
    assert f1 == f2

    lfs.rm("sample.txt")
    lfs.rm("sample2.txt")

    # Check that put on a file with data works
    with open("sample3.txt", "wb") as f:
        f.write(b"01234567890")
    fs.put("sample3.txt", "putdir/sample3.txt")
    fs.get("putdir/sample3.txt", "sample4.txt")

    with open("sample3.txt", "rb") as f:
        f3 = f.read()
    with open("sample4.txt", "rb") as f:
        f4 = f.read()
    assert f3 == f4

    fs.rm("putdir", recursive=True)
def test_dask_parquet(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.mkdir("test")
    STORAGE_OPTIONS = {
        "account_name": "devstoreaccount1",
        "connection_string": CONN_STR,
    }
    df = pd.DataFrame(
        {
            "col1": [1, 2, 3, 4],
            "col2": [2, 4, 6, 8],
            "index_key": [1, 1, 2, 2],
            "partition_key": [1, 1, 2, 2],
        }
    )

    dask_dataframe = dd.from_pandas(df, npartitions=1)
    for protocol in ["abfs", "az"]:
        dask_dataframe.to_parquet(
            "{}://test/test_group.parquet".format(protocol),
            storage_options=STORAGE_OPTIONS,
            engine="pyarrow",
        )

        fs = AzureBlobFileSystem(**STORAGE_OPTIONS)
        assert fs.ls("test/test_group.parquet") == [
            "test/test_group.parquet/_common_metadata",
            "test/test_group.parquet/_metadata",
            "test/test_group.parquet/part.0.parquet",
        ]
        fs.rm("test/test_group.parquet")

    df_test = dd.read_parquet(
        "abfs://test/test_group.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df, df_test)

    A = np.random.randint(0, 100, size=(10000, 4))
    df2 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf2 = dd.from_pandas(df2, npartitions=4)
    dd.to_parquet(
        ddf2,
        "abfs://test/test_group2.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.ls("test/test_group2.parquet") == [
        "test/test_group2.parquet/_common_metadata",
        "test/test_group2.parquet/_metadata",
        "test/test_group2.parquet/part.0.parquet",
        "test/test_group2.parquet/part.1.parquet",
        "test/test_group2.parquet/part.2.parquet",
        "test/test_group2.parquet/part.3.parquet",
    ]
    df2_test = dd.read_parquet(
        "abfs://test/test_group2.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df2, df2_test)

    a = np.full(shape=(10000, 1), fill_value=1)
    b = np.full(shape=(10000, 1), fill_value=2)
    c = np.full(shape=(10000, 1), fill_value=3)
    d = np.full(shape=(10000, 1), fill_value=4)
    B = np.concatenate((a, b, c, d), axis=1)
    df3 = pd.DataFrame(data=B, columns=list("ABCD"))
    ddf3 = dd.from_pandas(df3, npartitions=4)
    dd.to_parquet(
        ddf3,
        "abfs://test/test_group3.parquet",
        partition_on=["A", "B"],
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.glob("test/test_group3.parquet/*") == [
        "test/test_group3.parquet/A=1",
        "test/test_group3.parquet/_common_metadata",
        "test/test_group3.parquet/_metadata",
    ]
    df3_test = dd.read_parquet(
        "abfs://test/test_group3.parquet",
        filters=[("A", "=", 1)],
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    df3_test = df3_test[["A", "B", "C", "D"]]
    df3_test = df3_test[["A", "B", "C", "D"]].astype(int)
    assert_frame_equal(df3, df3_test)

    A = np.random.randint(0, 100, size=(10000, 4))
    df4 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf4 = dd.from_pandas(df4, npartitions=4)
    dd.to_parquet(
        ddf4,
        "abfs://test/test_group4.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
        flavor="spark",
        write_statistics=False,
    )
    fs.rmdir("test/test_group4.parquet/_common_metadata", recursive=True)
    fs.rmdir("test/test_group4.parquet/_metadata", recursive=True)
    fs.rm("test/test_group4.parquet/_common_metadata")
    fs.rm("test/test_group4.parquet/_metadata")
    assert fs.ls("test/test_group4.parquet") == [
        "test/test_group4.parquet/part.0.parquet",
        "test/test_group4.parquet/part.1.parquet",
        "test/test_group4.parquet/part.2.parquet",
        "test/test_group4.parquet/part.3.parquet",
    ]
    df4_test = dd.read_parquet(
        "abfs://test/test_group4.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df4, df4_test)

    A = np.random.randint(0, 100, size=(10000, 4))
    df5 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf5 = dd.from_pandas(df5, npartitions=4)
    dd.to_parquet(
        ddf5,
        "abfs://test/test group5.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.ls("test/test group5.parquet") == [
        "test/test group5.parquet/_common_metadata",
        "test/test group5.parquet/_metadata",
        "test/test group5.parquet/part.0.parquet",
        "test/test group5.parquet/part.1.parquet",
        "test/test group5.parquet/part.2.parquet",
        "test/test group5.parquet/part.3.parquet",
    ]
    df5_test = dd.read_parquet(
        "abfs://test/test group5.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df5, df5_test)
def test_large_blob(storage):
    import tempfile
    import hashlib
    import io
    import shutil
    from pathlib import Path

    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    # create a ~120 MB byte array, ensure it's larger than blocksizes to force a
    # chunked upload
    blob_size = 120_000_000
    # blob_size = 2_684_354_560
    assert blob_size > fs.blocksize
    assert blob_size > AzureBlobFile.DEFAULT_BLOCK_SIZE

    data = b"1" * blob_size
    _hash = hashlib.md5(data)
    expected = _hash.hexdigest()

    # create container
    fs.mkdir("chunk-container")

    # upload the data using fs.open
    path = "chunk-container/large-blob.bin"
    with fs.open(path, "ab") as dst:
        dst.write(data)

    assert fs.exists(path)
    assert fs.size(path) == blob_size

    del data

    # download with fs.open
    bio = io.BytesIO()
    with fs.open(path, "rb") as src:
        shutil.copyfileobj(src, bio)

    # read back the data and calculate md5
    bio.seek(0)
    data = bio.read()
    _hash = hashlib.md5(data)
    result = _hash.hexdigest()

    assert expected == result

    # do the same but using upload/download and a tempdir
    path = "chunk-container/large_blob2.bin"
    with tempfile.TemporaryDirectory() as td:
        local_blob: Path = Path(td) / "large_blob2.bin"
        with local_blob.open("wb") as fo:
            fo.write(data)
        assert local_blob.exists()
        assert local_blob.stat().st_size == blob_size

        fs.upload(str(local_blob), path)
        assert fs.exists(path)
        assert fs.size(path) == blob_size

        # download now
        local_blob.unlink()
        fs.download(path, str(local_blob))
        assert local_blob.exists()
        assert local_blob.stat().st_size == blob_size
def test_ls(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    ## these are containers
    assert fs.ls("") == ["data/"]
    assert fs.ls("/") == ["data/"]
    assert fs.ls(".") == ["data/"]

    ## these are top-level directories and files
    assert fs.ls("data") == ["data/root/", "data/top_file.txt"]
    assert fs.ls("/data") == ["data/root/", "data/top_file.txt"]

    # root contains files and directories
    assert fs.ls("data/root") == [
        "data/root/a/",
        "data/root/b/",
        "data/root/c/",
        "data/root/rfile.txt",
    ]
    assert fs.ls("data/root/") == [
        "data/root/a/",
        "data/root/b/",
        "data/root/c/",
        "data/root/rfile.txt",
    ]

    ## slashes are not needed, but accepted
    assert fs.ls("data/root/a") == ["data/root/a/file.txt"]
    assert fs.ls("data/root/a/") == ["data/root/a/file.txt"]
    assert fs.ls("/data/root/a") == ["data/root/a/file.txt"]
    assert fs.ls("/data/root/a/") == ["data/root/a/file.txt"]

    ## file details
    assert fs.ls("data/root/a/file.txt", detail=True) == [
        {"name": "data/root/a/file.txt", "size": 10, "type": "file"}
    ]

    # c has two files
    assert fs.ls("data/root/c", detail=True) == [
        {"name": "data/root/c/file1.txt", "size": 10, "type": "file"},
        {"name": "data/root/c/file2.txt", "size": 10, "type": "file"},
    ]

    ## if no direct match is found, an error is raised
    with pytest.raises(FileNotFoundError):
        fs.ls("not-a-container")

    with pytest.raises(FileNotFoundError):
        fs.ls("data/not-a-directory/")

    with pytest.raises(FileNotFoundError):
        fs.ls("data/root/not-a-file.txt")
def test_mkdir_rm_recursive(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    fs.mkdir("test_mkdir_rm_recursive")
    assert "test_mkdir_rm_recursive/" in fs.ls("")

    with fs.open("test_mkdir_rm_recursive/file.txt", "wb") as f:
        f.write(b"0123456789")

    with fs.open("test_mkdir_rm_recursive/dir/file.txt", "wb") as f:
        f.write(b"ABCD")

    with fs.open("test_mkdir_rm_recursive/dir/file2.txt", "wb") as f:
        f.write(b"abcdef")

    assert fs.find("test_mkdir_rm_recursive") == [
        "test_mkdir_rm_recursive/dir/file.txt",
        "test_mkdir_rm_recursive/dir/file2.txt",
        "test_mkdir_rm_recursive/file.txt",
    ]

    fs.rm("test_mkdir_rm_recursive", recursive=True)

    assert "test_mkdir_rm_recursive/" not in fs.ls("")
    assert fs.find("test_mkdir_rm_recursive") == []
def test_fetch_entire_blob(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )
    blob = fs.open("data/top_file.txt")
    assert len(blob._fetch_range(start=0, length=10)) == 10
def test_connect(storage):
    AzureBlobFileSystem(account_name=storage.account_name, connection_string=CONN_STR)