def test_download_single_to_dir(tempdir, azure):
    with azure_teardown(azure):
        name = posix(test_dir, 'remote.csv')
        lines = 100
        size, checksum = create_remote_csv(azure, name, 10, 5, lines)
        fname = os.path.join(tempdir, 'remote.csv')
        try:
            down = ADLDownloader(azure, name, tempdir, 1, 2**24,
                                 overwrite=True)
            assert md5sum(fname) == checksum
            assert os.stat(fname).st_size == size
            assert linecount(fname) == lines
        finally:
            if os.path.isfile(fname):
                os.remove(fname)
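# The md5sum and linecount helpers used above come from the shared test
# utilities and are not shown in this section. A minimal sketch of their
# assumed behaviour (the bodies below are assumptions, not the suite's
# actual implementation):
import hashlib

def md5sum(fname, chunksize=2**20):
    # Hash the file in fixed-size chunks so large downloads never need to
    # fit in memory at once.
    h = hashlib.md5()
    with open(fname, 'rb') as f:
        for chunk in iter(lambda: f.read(chunksize), b''):
            h.update(chunk)
    return h.hexdigest()

def linecount(fname):
    # Count lines in binary mode so the tally matches the bytes the CSV
    # writer produced, independent of platform newline translation.
    with open(fname, 'rb') as f:
        return sum(1 for _ in f)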
def test_full_read(azure):
    with azure_teardown(azure):
        with azure.open(a, 'wb') as f:
            f.write(b'0123456789')

        with azure.open(a, 'rb') as f:
            assert len(f.read(4)) == 4
            assert len(f.read(4)) == 4
            assert len(f.read(4)) == 2

        with azure.open(a, 'rb') as f:
            assert len(f.read()) == 10

        with azure.open(a, 'rb') as f:
            assert f.tell() == 0
            f.seek(3)
            assert f.read(4) == b'3456'
            assert f.tell() == 7
            assert f.read(4) == b'789'
            assert f.tell() == 10
def test_filename_specialchar(azure):
    with azure_teardown(azure):
        with azure.open(specialCharFile, 'wb') as f:
            f.write(b'0123456789')

        with azure.open(specialCharFile, 'rb') as f:
            assert len(f.read(4)) == 4
            assert len(f.read(4)) == 4
            assert len(f.read(4)) == 2

        with azure.open(specialCharFile, 'rb') as f:
            assert len(f.read()) == 10

        with azure.open(specialCharFile, 'rb') as f:
            assert f.tell() == 0
            f.seek(3)
            assert f.read(4) == b'3456'
            assert f.tell() == 7
            assert f.read(4) == b'789'
            assert f.tell() == 10
def test_ls_batched(azure):
    test_dir = working_dir() / 'abc'
    azure.mkdir(test_dir)
    with azure_teardown(azure):
        test_size = 10
        assert azure._ls(test_dir, batch_size=10) == []
        create_files(azure, number_of_files=10, prefix='123',
                     root_path=test_dir)
        # batch_size=1 is rejected; every larger batch size must return the
        # full listing regardless of how many requests that takes
        with pytest.raises(ValueError):
            assert len(azure._ls(test_dir, batch_size=1)) == test_size

        assert len(azure._ls(test_dir, batch_size=9)) == test_size
        assert len(azure._ls(test_dir, batch_size=10)) == test_size
        assert len(azure._ls(test_dir, batch_size=11)) == test_size
        assert len(azure._ls(test_dir, batch_size=2)) == test_size
        assert len(azure._ls(test_dir, batch_size=100)) == test_size
        assert len(azure._ls(test_dir)) == test_size
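# Sketch of the create_files helper assumed by test_ls_batched; the real
# implementation lives in the shared test utilities, and the file-naming
# scheme below is an assumption:
def create_files(adl, number_of_files, prefix, root_path):
    # Touch empty remote files '<prefix>_0' ... '<prefix>_<n-1>' so the
    # listing tests have a known number of entries to page through.
    for i in range(number_of_files):
        adl.touch(root_path / '{}_{}'.format(prefix, i))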
def test_download_single_empty_file(tempdir, azure):
    with azure_teardown(azure):
        name = posix(test_dir, 'remote.csv')
        lines = 0  # the file should have no bytes in it
        size, checksum = create_remote_csv(azure, name, 10, 5, lines)
        fname = os.path.join(tempdir, 'local.csv')

        # single chunk
        try:
            down = ADLDownloader(azure, name, fname, 1, size + 10,
                                 overwrite=True)
            assert md5sum(fname) == checksum
            assert os.stat(fname).st_size == size
            assert linecount(fname) == lines
        finally:
            if os.path.isfile(fname):
                os.remove(fname)
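# Both download tests above depend on a create_remote_csv helper that writes
# a CSV file into the store and returns its (size, md5 checksum). A minimal
# sketch, under the assumption that the two integer arguments are the field
# width and the column count (the real helper is defined elsewhere):
import hashlib

def create_remote_csv(adl, path, width, ncols, lines):
    # Build `lines` identical rows of `ncols` fixed-width fields, write them
    # remotely, and report the byte size and checksum for later verification.
    row = ','.join(['x' * width] * ncols).encode() + b'\n'
    data = row * lines
    with adl.open(path, 'wb') as f:
        f.write(data)
    return len(data), hashlib.md5(data).hexdigest()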
def test_upload_single_file_in_dir(tempdir, azure):
    with azure_teardown(azure):
        lpath_dir = tempdir
        lfilename = os.path.join(lpath_dir, 'singlefile')
        with open(lfilename, 'wb') as f:
            f.write(b'0123456789')

        # transfer client w/ deterministic temporary directory
        from azure.datalake.store.multithread import put_chunk
        client = ADLTransferClient(azure, transfer=put_chunk,
                                   unique_temporary=False)

        up = ADLUploader(azure, test_dir / 'singlefiledir', lpath_dir,
                         nthreads=1, overwrite=True)
        assert azure.info(test_dir / 'singlefiledir' / 'singlefile')['length'] == 10
        azure.rm(test_dir / 'singlefiledir' / 'singlefile')
def test_readlines(azure):
    with azure_teardown(azure):
        with azure.open(a, 'wb') as f:
            f.write(b'123\n456')

        with azure.open(a, 'rb') as f:
            lines = f.readlines()
            assert lines == [b'123\n', b'456']

        with azure.open(a, 'rb') as f:
            assert list(f) == lines

        with azure.open(a, 'wb') as f:
            with pytest.raises(ValueError):
                f.read()

        bigdata = [b'fe', b'fi', b'fo'] * 1000
        with azure.open(a, 'wb') as f:
            f.write(b'\n'.join(bigdata))
        with azure.open(a, 'rb') as f:
            lines = list(f)
        assert all(l in [b'fe\n', b'fi\n', b'fo', b'fo\n'] for l in lines)
def test_upload_empty_folder(tempdir, azure):
    with azure_teardown(azure):
        os.mkdir(os.path.join(tempdir, "dir1"))
        os.mkdir(os.path.join(tempdir, "dir1", "b"))

        with open(os.path.join(tempdir, "dir1", "file.txt"), 'wb') as f:
            f.write(b'0123456789')

        # transfer client w/ deterministic temporary directory
        from azure.datalake.store.multithread import put_chunk
        client = ADLTransferClient(azure, transfer=put_chunk,
                                   unique_temporary=False)

        # single chunk, empty file
        up = ADLUploader(azure, test_dir / "dir1",
                         os.path.join(tempdir, "dir1"), nthreads=1,
                         overwrite=True)
        assert azure.info(test_dir / "dir1" / "b")['type'] == 'DIRECTORY'
        azure.rm(test_dir / "dir1", recursive=True)
def test_upload_one(local_files, azure):
    with azure_teardown(azure):
        bigfile, littlefile, emptyfile, a, b, c = local_files

        # transfer client w/ deterministic temporary directory
        from azure.datalake.store.multithread import put_chunk
        client = ADLTransferClient(azure, transfer=put_chunk,
                                   unique_temporary=False)

        # single chunk
        up = ADLUploader(azure, test_dir / 'littlefile', littlefile,
                         nthreads=1, overwrite=True)
        assert azure.info(test_dir / 'littlefile')['length'] == 10

        # multiple chunks, one thread
        size = 10000
        up = ADLUploader(azure, test_dir / 'bigfile', bigfile, nthreads=1,
                         chunksize=size//5, client=client, run=False,
                         overwrite=True)
        up.run()
        assert azure.info(test_dir / 'bigfile')['length'] == size
        azure.rm(test_dir / 'bigfile')
def test_upload_glob(tempdir, azure):
    for directory in ['a', 'b']:
        d = os.path.join(tempdir, 'data', directory)
        os.makedirs(d)
        for data in ['x.csv', 'y.csv', 'z.txt']:
            with open(os.path.join(d, data), 'wb') as f:
                f.write(b'0123456789')

    with azure_teardown(azure):
        local_path = os.path.join(tempdir, 'data', 'a', '*.csv')
        up = ADLUploader(azure, test_dir, local_path, run=False,
                         overwrite=True)
        file_pair_dict = dict(up._file_pairs)
        assert len(file_pair_dict.keys()) == 2
        rfiles = [posix(AzureDLPath(f).relative_to(test_dir))
                  for f in file_pair_dict.values()]
        assert sorted(rfiles) == sorted(['x.csv', 'y.csv'])

        local_path = os.path.join(tempdir, 'data', '*', '*.csv')
        up = ADLUploader(azure, test_dir, local_path, run=False,
                         overwrite=True)
        file_pair_dict = dict(up._file_pairs)
        assert len(file_pair_dict.keys()) == 4
        rfiles = [posix(AzureDLPath(f).relative_to(test_dir))
                  for f in file_pair_dict.values()]
        assert sorted(rfiles) == sorted([posix('a', 'x.csv'),
                                         posix('a', 'y.csv'),
                                         posix('b', 'x.csv'),
                                         posix('b', 'y.csv')])

        local_path = os.path.join(tempdir, 'data', '*', 'z.txt')
        up = ADLUploader(azure, test_dir, local_path, run=False,
                         overwrite=True)
        file_pair_dict = dict(up._file_pairs)
        assert len(file_pair_dict.keys()) == 2
        rfiles = [posix(AzureDLPath(f).relative_to(test_dir))
                  for f in file_pair_dict.values()]
        assert sorted(rfiles) == sorted([posix('a', 'z.txt'),
                                         posix('b', 'z.txt')])
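# The posix helper used throughout these tests normalises path pieces into a
# forward-slash string for comparison against remote listings. A one-line
# sketch of the assumed behaviour (the real helper lives in the shared test
# utilities):
def posix(*args):
    # AzureDLPath is a PurePath subclass, so as_posix() yields the
    # forward-slash form regardless of the host platform.
    return AzureDLPath(*args).as_posix()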
def test_touch_exists_no_invalidate_cache(azure):
    with azure_teardown(azure):
        azure.touch(a)
        # the touched file must be visible without invalidating the cache
        assert azure.exists(a, invalidate_cache=False)
def test_ls_empty_with_details(azure):
    with azure_teardown(azure):
        assert not azure.ls(test_dir, invalidate_cache=False, detail=True)
def test_glob_walk_invalidate_cache(azure):
    with azure_teardown(azure):
        azure.mkdir(test_dir / 'c')
        azure.mkdir(test_dir / 'c' / 'd')
        filenames = ['a', 'a1', 'a2', 'a3', 'b1', 'c/x1', 'c/x2', 'c/d/x3']
        filenames = [test_dir / s for s in filenames]
        for fn in filenames:
            azure.touch(fn)

        assert set(azure.glob(test_dir / 'a*')) == {
            posix(test_dir / 'a'),
            posix(test_dir / 'a1'),
            posix(test_dir / 'a2'),
            posix(test_dir / 'a3')}

        assert set(azure.glob(test_dir / 'c' / '*')) == {
            posix(test_dir / 'c' / 'x1'),
            posix(test_dir / 'c' / 'x2')}

        assert (set(azure.glob(test_dir / 'c')) ==
                set(azure.glob(test_dir / 'c' / '')))

        assert set(azure.glob(test_dir / 'a')) == {posix(test_dir / 'a')}
        assert set(azure.glob(test_dir / 'a1')) == {posix(test_dir / 'a1')}

        assert set(azure.glob(test_dir / '*')) == {
            posix(test_dir / 'a'),
            posix(test_dir / 'a1'),
            posix(test_dir / 'a2'),
            posix(test_dir / 'a3'),
            posix(test_dir / 'b1')}

        assert set(azure.walk(test_dir, invalidate_cache=True)) == {
            posix(test_dir / 'a'),
            posix(test_dir / 'a1'),
            posix(test_dir / 'a2'),
            posix(test_dir / 'a3'),
            posix(test_dir / 'b1'),
            posix(test_dir / 'c' / 'x1'),
            posix(test_dir / 'c' / 'x2'),
            posix(test_dir / 'c' / 'd' / 'x3')}

        assert set(azure.walk(test_dir / 'c', invalidate_cache=True)) == {
            posix(test_dir / 'c' / 'x1'),
            posix(test_dir / 'c' / 'x2'),
            posix(test_dir / 'c' / 'd' / 'x3')}

        assert (set(azure.walk(test_dir / 'c', invalidate_cache=True)) ==
                set(azure.walk(test_dir / 'c', invalidate_cache=True)))

        # test glob and walk with details=True
        glob_details = azure.glob(test_dir / '*', details=True,
                                  invalidate_cache=True)

        # validate that the objects are subscriptable
        assert glob_details[0]['name'] is not None
        assert glob_details[0]['type'] is not None

        walk_details = azure.walk(test_dir, details=True,
                                  invalidate_cache=True)
        assert walk_details[0]['name'] is not None
        assert walk_details[0]['type'] is not None
def test_delimiters_dash(azure):
    with azure_teardown(azure):
        write_delimited_data(azure, b'--')
def test_delimiters_newline(azure):
    with azure_teardown(azure):
        write_delimited_data(azure, b'\n')
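# Both delimiter tests delegate to a write_delimited_data helper defined
# elsewhere in the suite. A minimal sketch of the assumed behaviour: write
# records through the delimiter-aware writer (delimiter= is a real keyword
# argument of AzureDLFileSystem.open; the helper body is an assumption) and
# confirm the bytes round-trip:
def write_delimited_data(adl, delimiter):
    data = delimiter.join([b'123', b'456', b'789'])
    with adl.open(a, 'wb', delimiter=delimiter) as f:
        f.write(data)
    # the stored file should contain exactly the delimited records
    with adl.open(a, 'rb') as f:
        assert f.read() == data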
def test_touch_exists(azure):
    with azure_teardown(azure):
        azure.touch(a)
        assert azure.exists(a)