def test_upload_glob(tempdir, azure):
    for directory in ['a', 'b']:
        d = os.path.join(tempdir, 'data', directory)
        os.makedirs(d)
        for data in ['x.csv', 'y.csv', 'z.txt']:
            with open(os.path.join(d, data), 'wb') as f:
                f.write(b'0123456789')

    with azure_teardown(azure):
        local_path = os.path.join(tempdir, 'data', 'a', '*.csv')
        up = ADLUploader(azure, test_dir, local_path, run=False,
                         overwrite=True)

        file_pair_dict = dict(up._file_pairs)
        assert len(file_pair_dict.keys()) == 2
        rfiles = [posix(AzureDLPath(f).relative_to(test_dir))
                  for f in file_pair_dict.values()]
        assert sorted(rfiles) == sorted(['x.csv', 'y.csv'])

        local_path = os.path.join(tempdir, 'data', '*', '*.csv')
        up = ADLUploader(azure, test_dir, local_path, run=False,
                         overwrite=True)

        file_pair_dict = dict(up._file_pairs)
        assert len(file_pair_dict.keys()) == 4
        rfiles = [posix(AzureDLPath(f).relative_to(test_dir))
                  for f in file_pair_dict.values()]
        assert sorted(rfiles) == sorted([
            posix('a', 'x.csv'), posix('a', 'y.csv'),
            posix('b', 'x.csv'), posix('b', 'y.csv')])

        local_path = os.path.join(tempdir, 'data', '*', 'z.txt')
        up = ADLUploader(azure, test_dir, local_path, run=False,
                         overwrite=True)

        file_pair_dict = dict(up._file_pairs)
        assert len(file_pair_dict.keys()) == 2
        rfiles = [posix(AzureDLPath(f).relative_to(test_dir))
                  for f in file_pair_dict.values()]
        assert sorted(rfiles) == sorted([posix('a', 'z.txt'),
                                         posix('b', 'z.txt')])
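# The tests in this collection lean on a few helpers from the
# azure-datalake-store test suite (`test_dir`, `posix`, `azure_teardown`).
# A minimal sketch of what they might look like, as hypothetical
# reconstructions for readability, not the suite's actual definitions:
import contextlib
import posixpath

from azure.datalake.store.core import AzureDLPath

test_dir = AzureDLPath('azure_test_dir')  # assumed remote working directory


def posix(*args):
    # Join fragments with forward slashes for remote-path comparisons.
    return posixpath.join(*[str(a) for a in args])


@contextlib.contextmanager
def azure_teardown(fs):
    # Hand the filesystem to the test, then clean up under test_dir.
    try:
        yield fs
    finally:
        if fs.exists(test_dir):
            for path in fs.ls(test_dir, invalidate_cache=True):
                fs.rm(path, recursive=True)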
def test_upload_overwrite(local_files, azure):
    bigfile, littlefile, emptyfile, a, b, c = local_files

    with azure_teardown(azure):
        # make the file already exist.
        azure.touch('/{}/littlefile'.format(test_dir.as_posix()))

        with pytest.raises(OSError) as e:
            ADLUploader(azure, test_dir, littlefile, nthreads=1)
        assert test_dir.as_posix() in str(e)
def test_upload_many(local_files, azure):
    with azure_teardown(azure):
        bigfile, littlefile, emptyfile, a, b, c = local_files
        root = os.path.dirname(bigfile)

        # single thread
        up = ADLUploader(azure, test_dir, root, nthreads=1, overwrite=True)
        assert azure.info(test_dir / 'littlefile')['length'] == 10
        assert azure.cat(test_dir / 'nested1/nested2/a') == b'0123456789'
        assert len(azure.du(test_dir, deep=True)) == 6
        assert azure.du(test_dir, deep=True, total=True) == 10000 + 40
def upload_to_adls(account_name, source_path, destination_path,
                   thread_count=None, overwrite=False):
    client = cf_dls_filesystem(account_name)
    ADLUploader(client, destination_path, source_path, thread_count,
                overwrite=overwrite)
def test_save_up(local_files, azure):
    bigfile, littlefile, emptyfile, a, b, c = local_files
    root = os.path.dirname(bigfile)

    up = ADLUploader(azure, '', root, 1, 1000000, run=False, overwrite=True)
    up.save()

    alluploads = ADLUploader.load()
    assert up.hash in alluploads

    up.save(keep=False)
    alluploads = ADLUploader.load()
    assert up.hash not in alluploads
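# save()/load() persist transfer state so an interrupted upload can be
# resumed later. A minimal sketch of the resume flow, assuming `azure` is
# an authenticated AzureDLFileSystem and uploads were saved as above:
pending = ADLUploader.load()       # dict keyed by upload hash
for h, up in pending.items():
    up.run()                       # resumes the outstanding transfer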
def test_upload_overwrite(local_files, azure):
    bigfile, littlefile, emptyfile, a, b, c = local_files

    with azure_teardown(azure):
        # ensure the target folder exists, so the upload without
        # overwrite=True is forced to fail
        if not azure.exists(test_dir):
            azure.mkdir(test_dir)

        with pytest.raises(OSError) as e:
            ADLUploader(azure, test_dir, littlefile, nthreads=1)
        assert test_dir.as_posix() in str(e)
def test_upload_one(local_files, azure):
    with azure_teardown(azure):
        bigfile, littlefile, emptyfile, a, b, c = local_files

        # transfer client w/ deterministic temporary directory
        from azure.datalake.store.multithread import put_chunk
        client = ADLTransferClient(azure, transfer=put_chunk,
                                   unique_temporary=False)

        # single chunk
        up = ADLUploader(azure, test_dir / 'littlefile', littlefile,
                         nthreads=1, overwrite=True)
        assert azure.info(test_dir / 'littlefile')['length'] == 10

        # multiple chunks, one thread
        size = 10000
        up = ADLUploader(azure, test_dir / 'bigfile', bigfile, nthreads=1,
                         chunksize=size // 5, client=client, run=False,
                         overwrite=True)
        up.run()
        assert azure.info(test_dir / 'bigfile')['length'] == size

        azure.rm(test_dir / 'bigfile')
def do_put(self, line):
    parser = argparse.ArgumentParser(prog="put", add_help=False)
    parser.add_argument('local_path', type=str)
    parser.add_argument('remote_path', type=str, nargs='?', default='.')
    parser.add_argument('-b', '--chunksize', type=int, default=2**28)
    parser.add_argument('-c', '--threads', type=int, default=None)
    parser.add_argument('-f', '--force', action='store_true')

    try:
        args = parser.parse_args(line.split())
    except SystemExit:
        # argparse raises SystemExit on bad input; bail out here rather
        # than fall through to an undefined `args`.
        return

    ADLUploader(self._fs, args.remote_path, args.local_path,
                nthreads=args.threads, chunksize=args.chunksize,
                overwrite=args.force)
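# In the interactive ADL shell, this handler backs a command line such as:
#   put ./local.csv /remote/dir -b 268435456 -c 4 -f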
def upload_to_adls(cmd, account_name, source_path, destination_path,
                   chunk_size, buffer_size, block_size, thread_count=None,
                   overwrite=False, progress_callback=None):
    client = cf_dls_filesystem(cmd.cli_ctx, account_name)
    ADLUploader(
        client,
        destination_path,
        source_path,
        thread_count,
        chunksize=chunk_size,
        buffersize=buffer_size,
        blocksize=block_size,
        overwrite=overwrite,
        progress_callback=progress_callback or get_update_progress(cmd.cli_ctx))
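# A hypothetical invocation of the wrapper above (the argument values are
# illustrative; `cmd` is the command context supplied by the azure-cli
# framework):
upload_to_adls(cmd, 'myadlsaccount',
               source_path='/local/data',
               destination_path='/remote/data',
               chunk_size=2**28,
               buffer_size=2**22,
               block_size=2**22,
               thread_count=4,
               overwrite=True)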
def test_upload_one_empty_file(local_files, azure):
    with azure_teardown(azure):
        bigfile, littlefile, emptyfile, a, b, c = local_files

        # transfer client w/ deterministic temporary directory
        from azure.datalake.store.multithread import put_chunk
        client = ADLTransferClient(azure, transfer=put_chunk,
                                   unique_temporary=False)

        # single chunk, empty file
        up = ADLUploader(azure, test_dir / 'emptyfile', emptyfile,
                         nthreads=1, overwrite=True)
        assert azure.info(test_dir / 'emptyfile')['length'] == 0

        azure.rm(test_dir / 'emptyfile')
def test_upload_single_file_in_dir(tempdir, azure):
    with azure_teardown(azure):
        lpath_dir = tempdir
        lfilename = os.path.join(lpath_dir, 'singlefile')
        with open(lfilename, 'wb') as f:
            f.write(b'0123456789')

        # transfer client w/ deterministic temporary directory
        from azure.datalake.store.multithread import put_chunk
        client = ADLTransferClient(azure, transfer=put_chunk,
                                   unique_temporary=False)

        up = ADLUploader(azure, test_dir / 'singlefiledir', lpath_dir,
                         nthreads=1, overwrite=True)
        assert azure.info(test_dir / 'singlefiledir' / 'singlefile')['length'] == 10

        azure.rm(test_dir / 'singlefiledir' / 'singlefile')
def test_upload_empty_folder(tempdir, azure):
    with azure_teardown(azure):
        os.mkdir(os.path.join(tempdir, "dir1"))
        os.mkdir(os.path.join(tempdir, "dir1", "b"))

        with open(os.path.join(tempdir, "dir1", "file.txt"), 'wb') as f:
            f.write(b'0123456789')

        # transfer client w/ deterministic temporary directory
        from azure.datalake.store.multithread import put_chunk
        client = ADLTransferClient(azure, transfer=put_chunk,
                                   unique_temporary=False)

        # the empty subfolder should be recreated remotely as a directory
        up = ADLUploader(azure, test_dir / "dir1",
                         os.path.join(tempdir, "dir1"), nthreads=1,
                         overwrite=True)
        assert azure.info(test_dir / "dir1" / "b")['type'] == 'DIRECTORY'

        azure.rm(test_dir / "dir1", recursive=True)
def upload_to_adls(account_name, source_path, destination_path, chunk_size,
                   buffer_size, block_size, thread_count=None,
                   overwrite=False):
    client = cf_dls_filesystem(account_name)
    ADLUploader(
        client,
        destination_path,
        source_path,
        thread_count,
        chunksize=chunk_size,
        buffersize=buffer_size,
        blocksize=block_size,
        overwrite=overwrite)
def do_resume_upload(self, line):
    try:
        up = ADLUploader.load()[line]
        up.run()
    except KeyError:
        print("No such upload")
def do_list_uploads(self, line):
    print(ADLUploader.load())
def do_clear_uploads(self, line):
    ADLUploader.clear_saved()
import time

from azure.datalake.store.multithread import ADLUploader


def abc():
    xyz()


def xyz():
    cat()


def cat():
    time.sleep(100)


abc()

# clear_saved() is called on the class itself (as in the shell snippets
# above), so no instance is needed; ADLUploader() without its required
# filesystem and path arguments would raise a TypeError. It returns None,
# so there is nothing useful to print.
ADLUploader.clear_saved()