def test_download_blobxfer(test_output_dirs: TestOutputDirectories, is_file: bool, runner_config: AzureConfig) -> None:
    """
    Test for a bug in early versions of download_blobs: download is happening via prefixes, but because of
    stripping leading directory names, blobs got overwritten.
    """
    root = Path(test_output_dirs.root_dir)
    account_key = runner_config.get_dataset_storage_account_key()
    assert account_key is not None
    # Expected test data in Azure blobs:
    # folder1/folder1.txt with content "folder1.txt"
    # folder1_with_suffix/folder2.txt with content "folder2.txt"
    # folder1_with_suffix/folder1.txt with content "this comes from folder2"
    # With the bug present, folder1_with_suffix/folder1.txt will overwrite folder1/folder1.txt.
    blobs_root_path = "data-for-testsuite/folder1"
    if is_file:
        blobs_root_path += "/folder1.txt"
    download_blobs(runner_config.datasets_storage_account, account_key, blobs_root_path, root, is_file)
    folder1 = root / "folder1.txt"
    assert folder1.exists()
    if not is_file:
        otherfile = root / "otherfile.txt"
        folder2 = root / "folder2.txt"
        assert folder1.read_text().strip() == "folder1.txt"
        assert otherfile.exists()
        assert otherfile.read_text().strip() == "folder1.txt"
        assert not folder2.exists()

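# The test and helpers in this module call a download_blobs(account, account_key, blobs_root_path,
# destination, is_file=False) function that is defined elsewhere in the project. The sketch below is an
# illustration only, not the project's actual implementation: it assumes the blobxfer 1.x command line
# tool is installed and on PATH, and the function name download_blobs_cli_sketch and the exact flags
# used are assumptions that may differ from the real helper.
def download_blobs_cli_sketch(account: str, account_key: str, blobs_root_path: str,
                              destination: Path, is_file: bool = False) -> None:
    import subprocess  # local import to keep the sketch self-contained
    destination.mkdir(parents=True, exist_ok=True)
    command = ["blobxfer", "download",
               "--storage-account", account,
               "--storage-account-key", account_key,
               # blobs_root_path is "<container>/<path>"; a trailing slash restricts the download to
               # exactly that folder rather than to every blob whose name shares the prefix.
               "--remote-path", blobs_root_path,
               "--local-path", str(destination)]
    # When is_file is True, blobs_root_path already points at a single blob, so the same command
    # downloads just that one file into the destination folder.
    subprocess.run(command, check=True)
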
def download_dataset_directory(azure_config: AzureConfig, dataset_dir: str) -> bool:
    """
    Downloads a dataset folder from the datasets storage account into the given local directory, unless that
    directory already exists. The name of the blob folder is taken from the last path component of `dataset_dir`.
    :param azure_config: The object with all Azure-related settings.
    :param dataset_dir: The local directory into which the dataset should be downloaded.
    :return: True if the dataset was downloaded, False if the directory already existed and the download was skipped.
    """
    if os.path.isdir(dataset_dir):
        return False
    account_key = azure_config.get_dataset_storage_account_key()
    blobs_root_path = os.path.join(azure_config.datasets_container, os.path.basename(dataset_dir)) + "/"
    sys.stdout.write(f"Downloading data to {dataset_dir} ...")
    assert account_key is not None  # for mypy
    download_blobs(azure_config.datasets_storage_account, account_key, blobs_root_path, Path(dataset_dir))
    sys.stdout.write("done\n")
    return True

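# Hypothetical usage sketch for download_dataset_directory: the local path below and the way the
# AzureConfig instance is created are assumptions for illustration; in the project the config is
# normally populated from settings files and environment variables.
def _example_download_dataset_directory() -> None:
    azure_config = AzureConfig()  # assumed: defaults, with secrets resolved elsewhere
    dataset_dir = "/datasets/my_dataset"  # hypothetical local path; its basename is used as the blob folder name
    if download_dataset_directory(azure_config, dataset_dir):
        print(f"Downloaded dataset into {dataset_dir}")
    else:
        print(f"{dataset_dir} already exists, nothing was downloaded")
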
def download_dataset_via_blobxfer(dataset_id: str, azure_config: AzureConfig, target_folder: Path) -> Optional[Path]:
    """
    Attempts to download a dataset from the Azure storage account for datasets, with the download happening via
    blobxfer. This is only possible if the datasets storage account and account key are present in the
    `azure_config`. The function returns None if the required settings were not present.
    :param dataset_id: The folder of the dataset, expected in the container given by azure_config.datasets_container.
    :param azure_config: The object with all Azure-related settings.
    :param target_folder: The local folder into which the dataset should be downloaded.
    :return: The folder that contains the downloaded dataset. Returns None if the datasets account name or key
    were not present.
    """
    datasets_account_key = azure_config.get_dataset_storage_account_key()
    if not datasets_account_key:
        logging.info("No account key for the dataset storage account was found.")
        logging.info(f"We checked in environment variables and in the file {PROJECT_SECRETS_FILE}")
        return None
    if (not azure_config.datasets_container) or (not azure_config.datasets_storage_account):
        logging.info("Datasets storage account or container missing.")
        return None
    target_folder.mkdir(exist_ok=True)
    result_folder = target_folder / dataset_id
    # Only download if the dataset has not already been downloaded.
    if result_folder.is_dir():
        logging.info(f"Folder already exists, skipping download: {result_folder}")
        return result_folder
    with logging_section(f"Downloading dataset {dataset_id}"):
        download_blobs(
            account=azure_config.datasets_storage_account,
            account_key=datasets_account_key,
            # When specifying the blobs root path, ensure that there is a slash at the end, otherwise
            # all datasets that have dataset_id as a prefix of their name get downloaded.
            blobs_root_path=f"{azure_config.datasets_container}/{dataset_id}/",
            destination=result_folder)
    return result_folder

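# Hypothetical usage sketch for download_dataset_via_blobxfer: dataset_id and the target folder are
# made-up values for illustration, and the bare AzureConfig() construction is an assumption. The
# function returns None when the storage account, container, or account key are not configured, so
# callers should be prepared to fall back to other data sources in that case.
def _example_download_dataset_via_blobxfer() -> None:
    azure_config = AzureConfig()  # assumed: datasets_storage_account / datasets_container set via config
    downloaded = download_dataset_via_blobxfer(dataset_id="my_dataset",
                                               azure_config=azure_config,
                                               target_folder=Path("datasets"))
    if downloaded is None:
        print("Dataset storage settings are missing, cannot download via blobxfer")
    else:
        print(f"Dataset is available at {downloaded}")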