def test_objects_to_push_deduped(self, mock_dataset_with_manifest):
    """Check that objects_to_push(remove_duplicates=True) lists each distinct object once.

    Three files share the same content and a fourth differs, so exactly two
    entries are expected. An empty .DS_Store file is also written into the
    push directory to confirm it is not reported.
    """
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)
    revision = manifest.dataset_revision
    cache_root = manifest.cache_mgr.cache_root

    os.makedirs(os.path.join(cache_root, revision, "other_dir"))
    seed_files = (
        ("test1.txt", "test content dup"),
        ("test2.txt", "test content dup"),
        ("test3.txt", "test content dup"),
        ("other_dir/test4.txt", "test content 4"),
    )
    for rel_path, text in seed_files:
        helper_append_file(cache_root, revision, rel_path, text)
    manifest.sweep_all_changes()

    # Drop a .DS_Store file next to the pending objects; it must be skipped
    junk_path = os.path.join(cache_root, 'objects', '.push', '.DS_Store')
    with open(junk_path, 'wt') as junk:
        junk.write("")

    pending = iom.objects_to_push(remove_duplicates=True)
    assert [obj.dataset_path for obj in pending] == ["other_dir/test4.txt", "test1.txt"]
    assert iom.num_objects_to_push(remove_duplicates=True) == 2
def test_objects_to_push(self, mock_dataset_with_manifest):
    """Check that objects_to_push() lists every swept object.

    test2.txt is appended to and swept a second time, so it is expected to
    appear twice, with a different revision on each entry.
    """
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)
    revision = manifest.dataset_revision
    cache_root = manifest.cache_mgr.cache_root

    os.makedirs(os.path.join(cache_root, revision, "other_dir"))
    initial_files = [
        ("test1.txt", "test content 1"),
        ("test2.txt", "test content 2"),
        ("other_dir/test4.txt", "test content 4"),
    ]
    for rel_path, text in initial_files:
        helper_append_file(cache_root, revision, rel_path, text)
    manifest.sweep_all_changes()

    # Modify a file so that two objects exist for the same key
    helper_append_file(cache_root, iom.manifest.dataset_revision,
                       "test2.txt", "test content 22")
    manifest.sweep_all_changes()

    pending = iom.objects_to_push()
    expected_paths = ["other_dir/test4.txt", "test1.txt", "test2.txt", "test2.txt"]
    assert [obj.dataset_path for obj in pending] == expected_paths
    assert pending[2].revision != pending[3].revision
    assert iom.num_objects_to_push() == 4
def test_push_objects_with_failure(self, mock_dataset_with_manifest):
    """Check that push_objects() reports one success and one failure.

    The upload endpoint for the first object is mocked to answer 200 and the
    one for the second object to answer 400.
    """
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)
    revision = manifest.dataset_revision
    cache_root = manifest.cache_mgr.cache_root

    os.makedirs(os.path.join(cache_root, revision, "other_dir"))
    helper_append_file(cache_root, revision, "test1.txt", "test content 1")
    helper_append_file(cache_root, revision, "test2.txt", "test content 2")
    manifest.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 2
    _, obj1 = obj_to_push[0].object_path.rsplit('/', 1)
    _, obj2 = obj_to_push[1].object_path.rsplit('/', 1)

    # Endpoints the push is expected to hit, per object
    obj1_api_url = f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj1}'
    obj1_upload_url = f"https://dummyurl.com/{obj1}?params=1"
    obj2_api_url = f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj2}'
    obj2_upload_url = f"https://dummyurl.com/{obj2}?params=1"

    with aioresponses() as mocked_responses:
        obj1_payload = {
            "presigned_url": obj1_upload_url,
            "namespace": ds.namespace,
            "key_id": "hghghg",
            "obj_id": obj1,
            "dataset": ds.name
        }
        mocked_responses.put(obj1_api_url, payload=obj1_payload, status=200)
        mocked_responses.put(obj1_upload_url, payload={}, status=200)

        obj2_payload = {
            "presigned_url": obj2_upload_url,
            "namespace": ds.namespace,
            "key_id": "hghghg",
            "obj_id": obj2,
            "dataset": ds.name
        }
        mocked_responses.put(obj2_api_url, payload=obj2_payload, status=200)
        # The second upload is rejected
        mocked_responses.put(obj2_upload_url, payload={}, status=400)

        push_dir_glob = f'{iom.push_dir}/*'
        assert len(glob.glob(push_dir_glob)) == 1
        iom.dataset.backend.set_default_configuration("test-user", "abcd", '1234')

        result = iom.push_objects()
        assert len(glob.glob(push_dir_glob)) == 1

        succeeded, failed = result.success, result.failure
        assert len(succeeded) == 1
        assert len(failed) == 1
        assert succeeded[0].object_path == obj_to_push[0].object_path
        assert failed[0].object_path == obj_to_push[1].object_path
def test_objects_to_push_ignore_other_branch(self, mock_dataset_with_manifest):
    """Check that objects added on another branch are not listed after switching back."""
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)
    revision = manifest.dataset_revision
    cache_root = manifest.cache_mgr.cache_root

    def pending_paths():
        # Dataset paths of everything currently waiting to be pushed, in order
        return [obj.dataset_path for obj in iom.objects_to_push()]

    os.makedirs(os.path.join(cache_root, revision, "other_dir"))
    helper_append_file(cache_root, revision, "test1.txt", "test content 1")
    helper_append_file(cache_root, revision, "test2.txt", "fdsfgfd")
    manifest.sweep_all_changes()
    assert pending_paths() == ["test1.txt", "test2.txt"]

    # Switch to a fresh branch and add one more file there
    bm = BranchManager(ds, username=USERNAME)
    starting_branch = bm.active_branch
    bm.create_branch(title="test-branch")
    assert bm.active_branch == "test-branch"
    assert ds.is_repo_clean is True

    helper_append_file(cache_root, iom.manifest.dataset_revision, "test3.txt", "fdsfgfd")
    manifest.sweep_all_changes()
    assert pending_paths() == ["test1.txt", "test2.txt", "test3.txt"]

    # Back on the original branch the extra file must not need pushing
    bm.workon_branch(starting_branch)
    assert pending_paths() == ["test1.txt", "test2.txt"]
def test_pull_objects_all_partial_download(self, mock_dataset_with_manifest):
    """Check that pull_all() downloads only the missing object and restores missing links.

    Three files are swept into the dataset. The link for other_dir/test3.txt
    is deleted and its object is handed to helper_compress_file (after which
    the test asserts the object file is gone); test1.txt only loses its link.
    Downloads are mocked for the first object only, and pull_all() is
    expected to report exactly one success and leave all three files readable
    in the revision directory. A second pull_all() must report that the
    dataset is already downloaded.

    The compressed copy written under /tmp is removed when the test ends,
    whether it passes or fails.
    """
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)

    revision = manifest.dataset_revision
    os.makedirs(os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
    helper_append_file(manifest.cache_mgr.cache_root, revision, "other_dir/test3.txt", "1")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test1.txt", "test content 1")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test2.txt", "test content 2")
    manifest.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 3
    # Only the first object is downloaded, so only its id is needed for the mocked URLs
    _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
    obj1_target = obj_to_push[0].object_path
    obj2_target = obj_to_push[1].object_path
    obj3_target = obj_to_push[2].object_path
    obj1_source = os.path.join('/tmp', uuid.uuid4().hex)

    try:
        assert "test3.txt" in obj_to_push[0].dataset_path

        assert os.path.exists(obj1_target) is True
        assert os.path.exists(obj2_target) is True
        assert os.path.exists(obj3_target) is True

        # Completely remove other_dir/test3.txt object
        os.remove(os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision,
                               "other_dir", "test3.txt"))
        helper_compress_file(obj1_target, obj1_source)

        # Remove link for test1.txt
        os.remove(os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision,
                               "test1.txt"))

        assert os.path.isfile(obj1_target) is False
        assert os.path.isfile(obj2_target) is True
        assert os.path.isfile(obj3_target) is True

        with aioresponses() as mocked_responses:
            mocked_responses.get(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                payload={
                    "presigned_url": f"https://dummyurl.com/{obj_id_1}?params=1",
                    "namespace": ds.namespace,
                    "obj_id": obj_id_1,
                    "dataset": ds.name
                },
                status=200)

            with open(obj1_source, 'rb') as data1:
                mocked_responses.get(f"https://dummyurl.com/{obj_id_1}?params=1",
                                     body=data1.read(), status=200,
                                     content_type='application/octet-stream')

            iom.dataset.backend.set_default_configuration("test-user", "abcd", '1234')

            result = iom.pull_all()
            assert len(result.success) == 1
            assert len(result.failure) == 0
            assert result.success[0].object_path == obj1_target
            assert "test3.txt" in result.success[0].dataset_path

            assert os.path.isfile(obj1_target) is True
            assert os.path.isfile(obj2_target) is True
            assert os.path.isfile(obj3_target) is True

            # Every file must be back in the revision directory with its original content
            filename = os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision,
                                    "other_dir", "test3.txt")
            assert os.path.isfile(filename) is True
            with open(filename, 'rt') as dd:
                assert dd.read() == "1"

            filename = os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision,
                                    "test1.txt")
            assert os.path.isfile(filename) is True
            with open(filename, 'rt') as dd:
                assert dd.read() == "test content 1"

            filename = os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision,
                                    "test2.txt")
            assert os.path.isfile(filename) is True
            with open(filename, 'rt') as dd:
                assert dd.read() == "test content 2"

            # Try pulling all again with nothing to pull
            result = iom.pull_all()
            assert len(result.success) == 0
            assert len(result.failure) == 0
            assert result.message == "Dataset already downloaded."
    finally:
        # Do not leave the compressed copy behind in /tmp
        if os.path.exists(obj1_source):
            os.remove(obj1_source)
def test_pull_objects_all(self, mock_dataset_with_manifest):
    """Check that pull_all() downloads every object after the local copies are removed.

    Two files are swept, their objects are handed to helper_compress_file
    (after which the test asserts the object files are gone and the
    compressed sources exist), the links are deleted and the objects
    directory is recreated empty. With both downloads mocked, pull_all() must
    report two distinct successes whose content matches the decompressed
    sources.

    The compressed copies written under /tmp are removed when the test ends,
    whether it passes or fails.
    """
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)

    revision = manifest.dataset_revision
    os.makedirs(os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test1.txt", "test content 1")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test2.txt", "test content 2")
    manifest.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 2
    _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
    _, obj_id_2 = obj_to_push[1].object_path.rsplit('/', 1)
    obj1_target = obj_to_push[0].object_path
    obj2_target = obj_to_push[1].object_path

    obj1_source = os.path.join('/tmp', uuid.uuid4().hex)
    obj2_source = os.path.join('/tmp', uuid.uuid4().hex)

    # Maps each object path to the compressed file it should be compared against
    check_info = {obj1_target: obj1_source,
                  obj2_target: obj2_source}

    try:
        assert os.path.exists(obj1_target) is True
        assert os.path.exists(obj2_target) is True
        helper_compress_file(obj1_target, obj1_source)
        helper_compress_file(obj2_target, obj2_source)
        assert os.path.isfile(obj1_target) is False
        assert os.path.isfile(obj2_target) is False
        assert os.path.isfile(obj1_source) is True
        assert os.path.isfile(obj2_source) is True

        # remove data from the local file cache
        os.remove(os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision,
                               "test1.txt"))
        os.remove(os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision,
                               "test2.txt"))
        shutil.rmtree(os.path.join(manifest.cache_mgr.cache_root, 'objects'))
        os.makedirs(os.path.join(manifest.cache_mgr.cache_root, 'objects'))

        with aioresponses() as mocked_responses:
            mocked_responses.get(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                payload={
                    "presigned_url": f"https://dummyurl.com/{obj_id_1}?params=1",
                    "namespace": ds.namespace,
                    "obj_id": obj_id_1,
                    "dataset": ds.name
                },
                status=200)

            with open(obj1_source, 'rb') as data1:
                mocked_responses.get(f"https://dummyurl.com/{obj_id_1}?params=1",
                                     body=data1.read(), status=200,
                                     content_type='application/octet-stream')

            mocked_responses.get(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_2}',
                payload={
                    "presigned_url": f"https://dummyurl.com/{obj_id_2}?params=1",
                    "namespace": ds.namespace,
                    "obj_id": obj_id_2,
                    "dataset": ds.name
                },
                status=200)

            with open(obj2_source, 'rb') as data2:
                mocked_responses.get(f"https://dummyurl.com/{obj_id_2}?params=1",
                                     body=data2.read(), status=200,
                                     content_type='application/octet-stream')

            iom.dataset.backend.set_default_configuration("test-user", "abcd", '1234')

            result = iom.pull_all()
            assert len(result.success) == 2
            assert len(result.failure) == 0
            # Order of the results is not asserted, only that both objects are present
            assert result.success[0].object_path != result.success[1].object_path
            assert result.success[0].object_path in [obj_to_push[0].object_path,
                                                     obj_to_push[1].object_path]
            assert result.success[1].object_path in [obj_to_push[0].object_path,
                                                     obj_to_push[1].object_path]

            assert os.path.isfile(obj1_target) is True
            assert os.path.isfile(obj2_target) is True

            decompressor = snappy.StreamDecompressor()
            for r in result.success:
                with open(check_info[r.object_path], 'rb') as dd:
                    source1 = decompressor.decompress(dd.read())
                    source1 += decompressor.flush()
                with open(r.object_path, 'rt') as dd:
                    dest1 = dd.read()
                assert source1.decode("utf-8") == dest1
    finally:
        # Do not leave the compressed copies behind in /tmp
        for leftover in (obj1_source, obj2_source):
            if os.path.exists(leftover):
                os.remove(leftover)
def test_pull_objects(self, mock_dataset_with_manifest):
    """Check that pull_objects(keys=[...]) downloads only the requested key.

    Two files are swept and their objects are handed to helper_compress_file
    (after which the test asserts the object files are gone). test1.txt is
    pulled first and test2.txt must still be absent; test2.txt is pulled
    afterwards and both objects must then hold their original content.

    The compressed copies written under /tmp are removed when the test ends,
    whether it passes or fails.
    """
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)

    revision = manifest.dataset_revision
    os.makedirs(os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test1.txt", "test content 1")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test2.txt", "test content 2")
    manifest.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 2
    _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
    _, obj_id_2 = obj_to_push[1].object_path.rsplit('/', 1)
    obj1_target = obj_to_push[0].object_path
    obj2_target = obj_to_push[1].object_path

    obj1_source = os.path.join('/tmp', uuid.uuid4().hex)
    obj2_source = os.path.join('/tmp', uuid.uuid4().hex)

    try:
        assert os.path.exists(obj1_target) is True
        assert os.path.exists(obj2_target) is True
        helper_compress_file(obj1_target, obj1_source)
        helper_compress_file(obj2_target, obj2_source)
        assert os.path.isfile(obj1_target) is False
        assert os.path.isfile(obj2_target) is False
        assert os.path.isfile(obj1_source) is True
        assert os.path.isfile(obj2_source) is True

        with aioresponses() as mocked_responses:
            mocked_responses.get(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                payload={
                    "presigned_url": f"https://dummyurl.com/{obj_id_1}?params=1",
                    "namespace": ds.namespace,
                    "obj_id": obj_id_1,
                    "dataset": ds.name
                },
                status=200)

            with open(obj1_source, 'rb') as data1:
                mocked_responses.get(f"https://dummyurl.com/{obj_id_1}?params=1",
                                     body=data1.read(), status=200,
                                     content_type='application/octet-stream')

            mocked_responses.get(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_2}',
                payload={
                    "presigned_url": f"https://dummyurl.com/{obj_id_2}?params=1",
                    "namespace": ds.namespace,
                    "obj_id": obj_id_2,
                    "dataset": ds.name
                },
                status=200)

            with open(obj2_source, 'rb') as data2:
                mocked_responses.get(f"https://dummyurl.com/{obj_id_2}?params=1",
                                     body=data2.read(), status=200,
                                     content_type='application/octet-stream')

            assert len(glob.glob(f'{iom.push_dir}/*')) == 1
            iom.dataset.backend.set_default_configuration("test-user", "abcd", '1234')

            # First pull: only test1.txt is requested
            result = iom.pull_objects(keys=["test1.txt"])
            assert len(glob.glob(f'{iom.push_dir}/*')) == 1
            assert len(result.success) == 1
            assert len(result.failure) == 0
            assert result.success[0].object_path == obj_to_push[0].object_path

            assert os.path.isfile(obj1_target) is True
            assert os.path.isfile(obj2_target) is False
            with open(obj1_target, 'rt') as dd:
                assert "test content 1" == dd.read()

            # Second pull: test2.txt is fetched and test1.txt stays intact
            result = iom.pull_objects(keys=["test2.txt"])
            assert len(glob.glob(f'{iom.push_dir}/*')) == 1
            assert len(result.success) == 1
            assert len(result.failure) == 0
            assert result.success[0].object_path == obj_to_push[1].object_path

            assert os.path.isfile(obj1_target) is True
            assert os.path.isfile(obj2_target) is True
            with open(obj1_target, 'rt') as dd:
                assert "test content 1" == dd.read()
            with open(obj2_target, 'rt') as dd:
                assert "test content 2" == dd.read()
    finally:
        # Do not leave the compressed copies behind in /tmp
        for leftover in (obj1_source, obj2_source):
            if os.path.exists(leftover):
                os.remove(leftover)
def test_download_dataset_files_file_fail(self, mock_config_file_background_tests):
    """Check that download_dataset_files() raises IOError in the failed-file scenario.

    Dispatcher.dispatch_task is patched to run pull_objects inline and
    Dispatcher.query_task is patched to return a finished job whose metadata
    lists 'test1.txt' under 'failure_keys'. The call is expected to raise
    IOError and the object file must still be missing afterwards.

    The compressed copy written under /tmp is removed when the test ends,
    whether it passes or fails.
    """
    def dispatch_query_mock(self, job_key):
        # mock the job actually running and returning status
        JobStatus = namedtuple("JobStatus", ['status', 'meta'])
        return JobStatus(status='finished',
                         meta={'completed_bytes': '0', 'failure_keys': 'test1.txt'})

    def dispatch_mock(self, method_reference, kwargs, metadata, persist):
        # Run the job synchronously instead of queueing it
        gtmcore.dispatcher.dataset_jobs.pull_objects(**kwargs)
        return "afakejobkey"

    im = InventoryManager(mock_config_file_background_tests[0])
    ds = im.create_dataset('default', 'default', "dataset100",
                           storage_type="gigantum_object_v1", description="100")
    m = Manifest(ds, 'default')
    iom = IOManager(ds, m)

    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt", "asdfadfsdf")
    m.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 1
    obj1_target = obj_to_push[0].object_path
    obj1_source = os.path.join('/tmp', uuid.uuid4().hex)

    try:
        assert os.path.exists(obj1_target) is True
        helper_compress_file(obj1_target, obj1_source)
        assert os.path.isfile(obj1_target) is False
        assert os.path.isfile(obj1_source) is True

        # Clear out from linked dir
        os.remove(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test1.txt'))

        with patch.object(Configuration, 'find_default_config',
                          lambda self: mock_config_file_background_tests[0]):
            with patch.object(Dispatcher, 'dispatch_task', dispatch_mock):
                with patch.object(Dispatcher, 'query_task', dispatch_query_mock):
                    dl_kwargs = {
                        'logged_in_username': "******",
                        'access_token': "asdf",
                        'id_token': "1234",
                        'dataset_owner': "default",
                        'dataset_name': "dataset100",
                        'labbook_owner': None,
                        'labbook_name': None,
                        'keys': ["test1.txt"],
                        'config_file': mock_config_file_background_tests[0]
                    }

                    with pytest.raises(IOError):
                        gtmcore.dispatcher.dataset_jobs.download_dataset_files(**dl_kwargs)
                    assert os.path.isfile(obj1_target) is False
    finally:
        # Do not leave the compressed copy behind in /tmp
        if os.path.exists(obj1_source):
            os.remove(obj1_source)
def test_download_dataset_files(self, mock_config_file_background_tests, mock_dataset_head):
    """Check that download_dataset_files() restores an object through the patched dispatcher.

    Dispatcher.dispatch_task is patched to run pull_objects inline with the
    download endpoints mocked, and Dispatcher.query_task is patched to return
    a finished job. Afterwards the object file must exist again and match the
    decompressed content of the compressed source.

    The compressed copy written under /tmp is removed when the test ends,
    whether it passes or fails.
    """
    def dispatch_query_mock(self, job_key):
        JobStatus = namedtuple("JobStatus", ['status', 'meta'])
        return JobStatus(status='finished', meta={'completed_bytes': '500'})

    def dispatch_mock(self, method_reference, kwargs, metadata, persist):
        # ds, obj_id_1 and obj1_source are bound by the enclosing test before this runs
        with aioresponses() as mocked_responses:
            mocked_responses.get(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                payload={
                    "presigned_url": f"https://dummyurl.com/{obj_id_1}?params=1",
                    "namespace": ds.namespace,
                    "obj_id": obj_id_1,
                    "dataset": ds.name
                },
                status=200)

            with open(obj1_source, 'rb') as data1:
                mocked_responses.get(f"https://dummyurl.com/{obj_id_1}?params=1",
                                     body=data1.read(), status=200,
                                     content_type='application/octet-stream')
            gtmcore.dispatcher.dataset_jobs.pull_objects(**kwargs)

            return "afakejobkey"

    im = InventoryManager(mock_config_file_background_tests[0])
    ds = im.create_dataset('default', 'default', "dataset100",
                           storage_type="gigantum_object_v1", description="100")
    m = Manifest(ds, 'default')
    iom = IOManager(ds, m)

    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt", "asdfadfsdf")
    m.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 1
    _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
    obj1_target = obj_to_push[0].object_path
    obj1_source = os.path.join('/tmp', uuid.uuid4().hex)

    try:
        assert os.path.exists(obj1_target) is True
        helper_compress_file(obj1_target, obj1_source)
        assert os.path.isfile(obj1_target) is False
        assert os.path.isfile(obj1_source) is True

        # Clear out from linked dir
        os.remove(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test1.txt'))

        with patch.object(Configuration, 'find_default_config',
                          lambda self: mock_config_file_background_tests[0]):
            with patch.object(Dispatcher, 'dispatch_task', dispatch_mock):
                with patch.object(Dispatcher, 'query_task', dispatch_query_mock):
                    dl_kwargs = {
                        'logged_in_username': "******",
                        'access_token': "asdf",
                        'id_token': "1234",
                        'dataset_owner': "default",
                        'dataset_name': "dataset100",
                        'labbook_owner': None,
                        'labbook_name': None,
                        'keys': ["test1.txt"],
                        'config_file': mock_config_file_background_tests[0]
                    }

                    gtmcore.dispatcher.dataset_jobs.download_dataset_files(**dl_kwargs)
                    assert os.path.isfile(obj1_target) is True

                    # The restored object must match the decompressed source
                    decompressor = snappy.StreamDecompressor()
                    with open(obj1_source, 'rb') as dd:
                        source1 = decompressor.decompress(dd.read())
                        source1 += decompressor.flush()
                    with open(obj1_target, 'rt') as dd:
                        dest1 = dd.read()
                    assert source1.decode("utf-8") == dest1
    finally:
        # Do not leave the compressed copy behind in /tmp
        if os.path.exists(obj1_source):
            os.remove(obj1_source)
def test_pull_objects(self, mock_config_file, mock_dataset_head):
    """Check that the pull_objects job downloads only the keys it is given.

    Two files are swept and their objects are handed to helper_compress_file
    (after which the test asserts the object files are gone). The job is run
    for test1.txt and then for test2.txt, linking the revision manually after
    each run, and the restored objects are compared with the decompressed
    sources.

    The compressed copies written under /tmp are removed when the test ends,
    whether it passes or fails.
    """
    im = InventoryManager(mock_config_file[0])
    ds = im.create_dataset('default', 'default', "dataset100",
                           storage_type="gigantum_object_v1", description="100")
    m = Manifest(ds, 'default')
    iom = IOManager(ds, m)

    os.makedirs(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, "other_dir"))
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt", "asdfadfsdf")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test2.txt", "fdsfgfd")
    m.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 2
    _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
    _, obj_id_2 = obj_to_push[1].object_path.rsplit('/', 1)
    obj1_target = obj_to_push[0].object_path
    obj2_target = obj_to_push[1].object_path

    obj1_source = os.path.join('/tmp', uuid.uuid4().hex)
    obj2_source = os.path.join('/tmp', uuid.uuid4().hex)

    # Job arguments shared by both pulls; only 'keys' differs between them
    common_kwargs = {
        'logged_in_username': "******",
        'access_token': "asdf",
        'id_token': "1234",
        'dataset_owner': "default",
        'dataset_name': "dataset100",
        'labbook_owner': None,
        'labbook_name': None,
    }

    try:
        assert os.path.exists(obj1_target) is True
        assert os.path.exists(obj2_target) is True
        helper_compress_file(obj1_target, obj1_source)
        helper_compress_file(obj2_target, obj2_source)
        assert os.path.isfile(obj1_target) is False
        assert os.path.isfile(obj2_target) is False
        assert os.path.isfile(obj1_source) is True
        assert os.path.isfile(obj2_source) is True

        # Clear out from linked dir
        os.remove(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test1.txt'))
        os.remove(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test2.txt'))

        with patch.object(Configuration, 'find_default_config', lambda self: mock_config_file[0]):
            with aioresponses() as mocked_responses:
                mocked_responses.get(
                    f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                    payload={
                        "presigned_url": f"https://dummyurl.com/{obj_id_1}?params=1",
                        "namespace": ds.namespace,
                        "obj_id": obj_id_1,
                        "dataset": ds.name
                    },
                    status=200)

                with open(obj1_source, 'rb') as data1:
                    mocked_responses.get(f"https://dummyurl.com/{obj_id_1}?params=1",
                                         body=data1.read(), status=200,
                                         content_type='application/octet-stream')

                mocked_responses.get(
                    f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_2}',
                    payload={
                        "presigned_url": f"https://dummyurl.com/{obj_id_2}?params=1",
                        "namespace": ds.namespace,
                        "obj_id": obj_id_2,
                        "dataset": ds.name
                    },
                    status=200)

                with open(obj2_source, 'rb') as data2:
                    mocked_responses.get(f"https://dummyurl.com/{obj_id_2}?params=1",
                                         body=data2.read(), status=200,
                                         content_type='application/octet-stream')

                gtmcore.dispatcher.dataset_jobs.pull_objects(**common_kwargs,
                                                             keys=["test1.txt"])

                # Manually link since this is disabled by default in the job (because in real
                # use, multiple jobs run in parallel and you only want to link once).
                m.link_revision()

                assert os.path.isfile(obj1_target) is True
                assert os.path.isfile(obj2_target) is False

                decompressor = snappy.StreamDecompressor()
                with open(obj1_source, 'rb') as dd:
                    source1 = decompressor.decompress(dd.read())
                    source1 += decompressor.flush()
                with open(obj1_target, 'rt') as dd:
                    dest1 = dd.read()
                assert source1.decode("utf-8") == dest1

                # Download other file
                gtmcore.dispatcher.dataset_jobs.pull_objects(**common_kwargs,
                                                             keys=["test2.txt"])

                # Manually link since this is disabled by default in the job (because in real
                # use, multiple jobs run in parallel and you only want to link once).
                m.link_revision()

                assert os.path.isfile(obj1_target) is True
                assert os.path.isfile(obj2_target) is True

                with open(obj1_source, 'rb') as dd:
                    source1 = decompressor.decompress(dd.read())
                    source1 += decompressor.flush()
                with open(obj1_target, 'rt') as dd:
                    dest1 = dd.read()
                assert source1.decode("utf-8") == dest1

                with open(obj2_source, 'rb') as dd:
                    source1 = decompressor.decompress(dd.read())
                    source1 += decompressor.flush()
                with open(obj2_target, 'rt') as dd:
                    dest1 = dd.read()
                assert source1.decode("utf-8") == dest1
    finally:
        # Do not leave the compressed copies behind in /tmp
        for leftover in (obj1_source, obj2_source):
            if os.path.exists(leftover):
                os.remove(leftover)
def test_push_objects(self, mock_config_file, mock_dataset_head):
    """Run the push_dataset_objects job for two new files against mocked upload endpoints."""
    im = InventoryManager(mock_config_file[0])
    ds = im.create_dataset('default', 'default', "dataset100",
                           storage_type="gigantum_object_v1", description="100")
    manifest = Manifest(ds, 'default')
    iom = IOManager(ds, manifest)
    revision = manifest.dataset_revision
    cache_root = manifest.cache_mgr.cache_root

    os.makedirs(os.path.join(cache_root, revision, "other_dir"))
    helper_append_file(cache_root, revision, "test1.txt", "test content 1")
    helper_append_file(cache_root, revision, "test2.txt", "test content 2")
    manifest.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 2
    _, obj1 = obj_to_push[0].object_path.rsplit('/', 1)
    _, obj2 = obj_to_push[1].object_path.rsplit('/', 1)

    # Endpoints the job is expected to hit, per object
    obj1_api_url = f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj1}'
    obj1_upload_url = f"https://dummyurl.com/{obj1}?params=1"
    obj2_api_url = f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj2}'
    obj2_upload_url = f"https://dummyurl.com/{obj2}?params=1"

    with aioresponses() as mocked_responses:
        obj1_payload = {
            "presigned_url": obj1_upload_url,
            "namespace": ds.namespace,
            "key_id": "hghghg",
            "obj_id": obj1,
            "dataset": ds.name
        }
        mocked_responses.put(obj1_api_url, payload=obj1_payload, status=200)
        mocked_responses.put(obj1_upload_url, payload={}, status=200)

        obj2_payload = {
            "presigned_url": obj2_upload_url,
            "namespace": ds.namespace,
            "key_id": "hghghg",
            "obj_id": obj2,
            "dataset": ds.name
        }
        mocked_responses.put(obj2_api_url, payload=obj2_payload, status=200)
        mocked_responses.put(obj2_upload_url, payload={}, status=200)

        gtmcore.dispatcher.dataset_jobs.push_dataset_objects(
            objs=obj_to_push,
            logged_in_username="******",
            access_token="faketoken",
            id_token="faketoken",
            dataset_owner=ds.namespace,
            dataset_name=ds.name,
            config_file=ds.client_config.config_file,
        )
def test__get_pull_all_keys(self, mock_dataset_with_manifest):
    """Check that _get_pull_all_keys() returns only keys whose object file is missing.

    With everything present, no keys are returned. After deleting the object
    and link for other_dir/test3.txt and only the link for test1.txt, the
    single key returned must be 'other_dir/test3.txt' and the test1.txt link
    must exist again.
    """
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)
    revision = manifest.dataset_revision
    cache_root = manifest.cache_mgr.cache_root

    os.makedirs(os.path.join(cache_root, revision, "other_dir"))
    for rel_path, text in (("other_dir/test3.txt", "dummy content"),
                           ("test1.txt", "test content 1"),
                           ("test2.txt", "test content 2")):
        helper_append_file(cache_root, revision, rel_path, text)
    manifest.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 3
    entry3, entry1, entry2 = obj_to_push[0], obj_to_push[1], obj_to_push[2]
    obj3 = entry3.object_path
    obj1 = entry1.object_path
    obj2 = entry2.object_path
    rev_dir = os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision)
    file3 = os.path.join(rev_dir, entry3.dataset_path)
    file1 = os.path.join(rev_dir, entry1.dataset_path)
    file2 = os.path.join(rev_dir, entry2.dataset_path)

    def present():
        # Existence flags in the order obj1, obj2, obj3, file1, file2, file3
        return [os.path.exists(path) for path in (obj1, obj2, obj3, file1, file2, file3)]

    assert present() == [True, True, True, True, True, True]

    keys = iom._get_pull_all_keys()
    assert len(keys) == 0

    # Completely remove other_dir/test3.txt object
    os.remove(obj3)
    os.remove(file3)

    # Remove link for test1.txt, should relink automatically and not need to be pulled
    os.remove(file1)

    assert present() == [True, True, False, False, True, False]

    keys = iom._get_pull_all_keys()
    assert len(keys) == 1
    assert keys[0] == 'other_dir/test3.txt'

    # The test1.txt link is back; the test3.txt object and link are still gone
    assert present() == [True, True, False, True, True, False]
def test_push_objects_deduplicate(self, mock_dataset_with_manifest, mock_dataset_head):
    """Check that pushing with duplicates removed uploads each distinct object once.

    test2.txt and test3.txt share the same content and therefore the same
    object id, so only two uploads are mocked and two successes are expected.
    """
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)
    revision = manifest.dataset_revision
    cache_root = manifest.cache_mgr.cache_root

    os.makedirs(os.path.join(cache_root, revision, "other_dir"))
    helper_append_file(cache_root, revision, "test1.txt", "test content 1")
    helper_append_file(cache_root, revision, "test2.txt", "test content dup")
    helper_append_file(cache_root, revision, "test3.txt", "test content dup")
    manifest.sweep_all_changes()

    all_objects = iom.objects_to_push()
    assert len(all_objects) == 3
    _, obj1 = all_objects[0].object_path.rsplit('/', 1)
    _, obj2 = all_objects[1].object_path.rsplit('/', 1)
    _, obj3 = all_objects[2].object_path.rsplit('/', 1)
    assert obj1 != obj2
    assert obj2 == obj3

    # Endpoints the push is expected to hit, per distinct object
    obj1_api_url = f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj1}'
    obj1_upload_url = f"https://dummyurl.com/{obj1}?params=1"
    obj2_api_url = f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj2}'
    obj2_upload_url = f"https://dummyurl.com/{obj2}?params=1"

    with aioresponses() as mocked_responses:
        obj1_payload = {
            "presigned_url": obj1_upload_url,
            "namespace": ds.namespace,
            "key_id": "hghghg",
            "obj_id": obj1,
            "dataset": ds.name
        }
        mocked_responses.put(obj1_api_url, payload=obj1_payload, status=200)
        mocked_responses.put(obj1_upload_url, headers={'Etag': 'asdfasdf'}, status=200)

        obj2_payload = {
            "presigned_url": obj2_upload_url,
            "namespace": ds.namespace,
            "key_id": "hghghg",
            "obj_id": obj2,
            "dataset": ds.name
        }
        mocked_responses.put(obj2_api_url, payload=obj2_payload, status=200)
        mocked_responses.put(obj2_upload_url, headers={'Etag': '12341234'}, status=200)

        push_dir_glob = f'{iom.push_dir}/*'
        assert len(glob.glob(push_dir_glob)) == 1
        iom.dataset.backend.set_default_configuration("test-user", "abcd", '1234')

        unique_objects = iom.objects_to_push(remove_duplicates=True)
        result = iom.push_objects(unique_objects, chunk_update_callback)
        assert len(glob.glob(push_dir_glob)) == 1

        assert len(result.success) == 2
        assert len(result.failure) == 0
        assert isinstance(result, PushResult)
        assert isinstance(result.success[0], PushObject)

        # Result order is not asserted, only that both distinct objects were pushed
        first_pushed = result.success[0].object_path
        second_pushed = result.success[1].object_path
        candidates = [unique_objects[0].object_path, unique_objects[1].object_path]
        assert first_pushed != second_pushed
        assert first_pushed in candidates
        assert second_pushed in candidates