def test_get_exceptions(s3root, prefixes, expected):
    # get_many() goes via s3op, get() is a method - test both the code paths
    with S3() as s3:
        with pytest.raises(MetaflowS3AccessDenied):
            s3.get_many(["s3://foobar/foo"])
        with pytest.raises(MetaflowS3AccessDenied):
            s3.get("s3://foobar/foo")
    with S3(s3root=s3root) as s3:
        with pytest.raises(MetaflowS3NotFound):
            s3.get_many(["this_file_does_not_exist"])
        with pytest.raises(MetaflowS3NotFound):
            s3.get("this_file_does_not_exist")
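# Note on fixtures: the tests below index `expected` as
# expected[url][range_key], where range_key is None for the whole object and
# each value exposes at least `.size` (None for missing objects) and `.range`.
# A hypothetical sketch of that shape; the field names, the namedtuples, and
# the non-None key type are all guesses made for illustration only:
from collections import namedtuple

_RangeInfoSketch = namedtuple("_RangeInfoSketch", "req_offset req_size")
_ExpectedResultSketch = namedtuple("_ExpectedResultSketch", "size range")

_expected_sketch = {
    "s3://bucket/prefix/key": {
        None: _ExpectedResultSketch(size=3, range=None),  # whole object
        "0-9": _ExpectedResultSketch(size=10, range=_RangeInfoSketch(0, 10)),
    }
}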
def test_get_many(s3root, prefixes, expected):
    def iter_objs(urls, objs):
        for url in urls:
            obj = objs[url]
            for r, expected in obj.items():
                if r is None:
                    yield url, None, None
                else:
                    yield url, expected.range.req_offset, expected.range.req_size

    with S3() as s3:
        # 1) test the non-missing case
        # to test result ordering, make sure we are requesting
        # keys in a non-lexicographic order
        not_missing_urls = [k for k, v in expected.items() if v[None].size is not None]
        urls_in_order = list(sorted(not_missing_urls, reverse=True))
        ranges_in_order = []
        for url in urls_in_order:
            ranges_in_order.extend(v.range for v in expected[url].values())
        objs_in_order = list(starmap(S3GetObject, iter_objs(urls_in_order, expected)))
        s3objs = s3.get_many(list(objs_in_order), return_info=True)
        fetched_urls = []
        for url in urls_in_order:
            fetched_urls.extend([url] * len(expected[url]))
        # results should come out in the order of keys requested
        assert fetched_urls == [e.url for e in s3objs]
        assert_results(s3objs, expected, ranges_fetched=ranges_in_order)

        # 2) test with missing items, default case
        if not_missing_urls != list(expected.keys()):
            urls_in_order = list(sorted(expected.keys(), reverse=True))
            ranges_in_order = []
            for url in urls_in_order:
                ranges_in_order.extend(v.range for v in expected[url].values())
            objs_in_order = list(
                starmap(S3GetObject, iter_objs(urls_in_order, expected))
            )
            fetched_urls = []
            for url in urls_in_order:
                fetched_urls.extend([url] * len(expected[url]))
            with pytest.raises(MetaflowS3NotFound):
                s3objs = s3.get_many(list(objs_in_order), return_info=True)

        # 3) test with missing items, return_missing=True
        # to test result ordering, make sure we are requesting
        # keys in a non-lexicographic order. Missing files should
        # be returned in order too
        # Here we can use urls_in_order, ranges_in_order and objs_in_order
        # because they always correspond to the full set
        s3objs = s3.get_many(list(objs_in_order), return_missing=True, return_info=True)
        assert fetched_urls == [e.url for e in s3objs]
        assert_results(s3objs, expected, ranges_fetched=ranges_in_order)
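# `iter_objs` above yields (url, offset, size) triples for
# starmap(S3GetObject, ...), which implies S3GetObject is a three-field
# tuple-like request object. A plausible, hypothetical shape (the real
# definition lives in metaflow's S3 module and may use other field names):
from itertools import starmap

_S3GetObjectSketch = namedtuple("_S3GetObjectSketch", "key offset length")

# e.g. one whole-object request followed by a ten-byte range request
_reqs = list(
    starmap(_S3GetObjectSketch, [("key_a", None, None), ("key_b", 0, 10)])
)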
def _do():
    with S3(s3root=s3root) as s3:
        res = []
        for _, obj in all_files:
            key = str(uuid4())  # New "name" every time
            res.append(s3.put(key, obj, overwrite=False))
        return res
def _do():
    with S3() as s3:
        res = []
        for url in expected:
            # Use return_missing as this is the most expensive path
            res.append(s3.get(url, return_missing=True))
        return res
def test_get_one_wo_meta(s3root, prefixes, expected):
    with S3() as s3:
        for url, item in expected.items():
            for _, expected_result in item.items():
                range_info = expected_result.range
                if expected_result.size is None:
                    # ensure that the default return_missing=False works
                    with pytest.raises(MetaflowS3NotFound):
                        s3obj = s3.get(s3_get_object_from_url_range(url, range_info))
                    s3obj = s3.get(
                        s3_get_object_from_url_range(url, range_info),
                        return_missing=True,
                        return_info=False,
                    )
                    assert_results(
                        [s3obj],
                        {url: item},
                        info_should_be_empty=True,
                        ranges_fetched=[range_info],
                    )
                else:
                    s3obj = s3.get(
                        s3_get_object_from_url_range(url, range_info),
                        return_info=False,
                    )
                    assert_results(
                        [s3obj],
                        {url: item},
                        info_should_be_empty=True,
                        ranges_fetched=[range_info],
                    )
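# `s3_get_object_from_url_range` is defined elsewhere in this suite. Given
# its call sites above, a minimal hypothetical sketch, reusing the assumed
# request shape from the note above (the real helper may differ):
def _s3_get_object_from_url_range_sketch(url, range_info):
    # For whole-object gets the tests pass range=None, so fall back to a
    # bare URL, which s3.get() also accepts.
    if range_info is None:
        return url
    return _S3GetObjectSketch(url, range_info.req_offset, range_info.req_size)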
def test_list_recursive(s3root, prefixes, expected):
    not_missing = [url for url, v in expected.items() if v[None].size is not None]
    with S3(s3root=s3root) as s3:
        s3objs = s3.list_recursive(prefixes)
        assert frozenset(e.url for e in s3objs) == frozenset(not_missing)
        # ensure that there are no duplicates
        assert len(s3objs) == len(not_missing)
        # list_recursive returns leaves only
        assert all(e.exists for e in s3objs)
def test_get_all_with_meta(s3root, prefixes, expected):
    expected_exists = {
        url: v for url, v in expected.items() if v[None].size is not None
    }
    for prefix in prefixes:
        with S3(s3root=os.path.join(s3root, prefix)) as s3:
            s3objs = s3.get_all(return_info=True)
            # results should be in lexicographic order
            assert list(sorted(e.url for e in s3objs)) == [e.url for e in s3objs]
            assert_results(s3objs, expected_exists)
def test_put_exceptions():
    with S3() as s3:
        with pytest.raises(MetaflowS3InvalidObject):
            s3.put_many([("a", 1)])
        with pytest.raises(MetaflowS3InvalidObject):
            s3.put("a", 1)
        with pytest.raises(MetaflowS3NotFound):
            s3.put_files([("a", "/non-existent/local-file")])
        with pytest.raises(MetaflowS3URLException):
            s3.put_many([("foo", "bar")])
def test_put_many(s3root, objs, expected):
    with S3(s3root=s3root) as s3:
        s3urls = s3.put_many(objs)
        assert list(dict(s3urls)) == list(dict(objs))
        # results must be in the same order as the keys requested
        for i in range(len(s3urls)):
            assert objs[i][0] == s3urls[i][0]

    with S3() as s3:
        s3objs = s3.get_many(dict(s3urls).values())
        assert_results(s3objs, expected)

    with S3(s3root=s3root) as s3:
        s3objs = s3.get_many(list(dict(objs)))
        assert {s3obj.key for s3obj in s3objs} == {key for key, _ in objs}

    # upload shuffled objs with overwrite disabled
    shuffled_objs = deranged_shuffle(objs)
    with S3(s3root=s3root) as s3:
        overwrite_disabled_s3urls = s3.put_many(shuffled_objs, overwrite=False)
        assert len(overwrite_disabled_s3urls) == 0

    with S3() as s3:
        s3objs = s3.get_many(dict(s3urls).values())
        assert_results(s3objs, expected)
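# `deranged_shuffle` comes from the shared test helpers; the name suggests a
# derangement: a shuffle in which no key keeps its original position, so the
# overwrite=False upload above runs over a genuinely reordered list. One
# plausible, hypothetical implementation:
from random import shuffle

def _deranged_shuffle_sketch(items):
    # Reshuffle until no element remains at its original index. Note: this
    # loops forever for inputs with fewer than two distinct elements.
    shuffled = items[:]
    while any(a == b for a, b in zip(shuffled, items)):
        shuffle(shuffled)
    return shuffled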
def test_info_one(s3root, prefixes, expected):
    with S3() as s3:
        for url, item in expected.items():
            if item[None].size is None:
                # ensure that the default return_missing=False works
                with pytest.raises(MetaflowS3NotFound):
                    s3obj = s3.info(url)
                # test return_missing=True
                s3obj = s3.info(url, return_missing=True)
                assert_results([s3obj], {url: expected[url]}, info_only=True)
            else:
                s3obj = s3.info(url)
                assert_results([s3obj], {url: expected[url]}, info_only=True)
def test_put_one(s3root, objs, expected):
    with S3(s3root=s3root) as s3:
        for key, obj in objs:
            s3url = s3.put(key, obj)
            assert s3url in expected
            s3obj = s3.get(key)
            assert s3obj.key == key
            assert_results([s3obj], {s3url: expected[s3url]})
            assert s3obj.blob == to_bytes(obj)
            # put with overwrite disabled
            s3url = s3.put(key, "random_value", overwrite=False)
            assert s3url in expected
            s3obj = s3.get(key)
            assert s3obj.key == key
            assert_results([s3obj], {s3url: expected[s3url]})
            assert s3obj.blob == to_bytes(obj)
def test_get_recursive(s3root, prefixes, expected):
    expected_exists = {
        url: v for url, v in expected.items() if v[None].size is not None
    }
    local_files = []
    with S3(s3root=s3root) as s3:
        s3objs = s3.get_recursive(prefixes)
        # we need to deduce which prefixes actually produce results
        nonempty_prefixes = list(
            filter(
                lambda p: any(
                    url.startswith(os.path.join(s3root, p)) for url in expected_exists
                ),
                prefixes,
            )
        )
        # prefixes must be returned in the order of prefixes requested
        plen = len(s3root)
        # materialize each group eagerly: itertools.groupby invalidates a
        # group's iterator as soon as the next group is requested, so
        # list(groupby(...)) alone would leave every group empty
        grouped = [
            (k, list(g)) for k, g in groupby(s3objs, lambda e: e.prefix[plen:])
        ]
        assert nonempty_prefixes == [prefix for prefix, _ in grouped]
        # for each prefix, the results should be in lexicographic order
        for prefix, objs in grouped:
            urls = [e.url for e in objs]
            assert list(sorted(urls)) == urls
        assert_results(s3objs, expected_exists, info_should_be_empty=True)
        # if there are multiple prefixes, it is a bit harder to know
        # what's the expected set of results. We do this test only
        # for the single-prefix case for now
        if len(prefixes) == 1:
            [prefix] = prefixes
            s3root = os.path.join(s3root, prefix)
            keys = {url[len(s3root) + 1:] for url in expected_exists}
            assert {e.key for e in s3objs} == keys
        local_files = [s3obj.path for s3obj in s3objs]
    # local files must not exist outside of the S3 context
    for path in local_files:
        assert not os.path.exists(path)
def test_put_files(tempdir, s3root, blobs, expected):
    def _files(blobs):
        for blob in blobs:
            key = getattr(blob, "key", blob[0])
            data = getattr(blob, "value", blob[1])
            content_type = getattr(blob, "content_type", None)
            metadata = getattr(blob, "metadata", None)
            path = os.path.join(tempdir, key)
            with open(path, "wb") as f:
                f.write(data)
            yield S3PutObject(
                key=key, value=path, content_type=content_type, metadata=metadata
            )

    with S3(s3root=s3root) as s3:
        s3urls = s3.put_files(_files(blobs))
        assert list(dict(s3urls)) == list(dict(blobs))

    with S3() as s3:
        # get urls
        s3objs = s3.get_many(dict(s3urls).values())
        assert_results(s3objs, expected)

    with S3(s3root=s3root) as s3:
        # get keys
        s3objs = s3.get_many(key for key, blob in blobs)
        assert {s3obj.key for s3obj in s3objs} == {key for key, _ in blobs}

    # upload shuffled blobs with overwrite disabled
    shuffled_blobs = blobs[:]
    shuffle(shuffled_blobs)
    with S3(s3root=s3root) as s3:
        overwrite_disabled_s3urls = s3.put_files(
            _files(shuffled_blobs), overwrite=False
        )
        assert len(overwrite_disabled_s3urls) == 0

    with S3() as s3:
        s3objs = s3.get_many(dict(s3urls).values())
        assert_results(s3objs, expected)

    with S3(s3root=s3root) as s3:
        s3objs = s3.get_many(key for key, blob in shuffled_blobs)
        assert {s3obj.key for s3obj in s3objs} == {key for key, _ in shuffled_blobs}
def test_init_options(s3root, pathspecs, expected):
    [pathspec] = pathspecs
    flow_name, run_id = pathspec.split("/")
    plen = len(s3root)

    # option 1) s3root as prefix
    with S3(s3root=s3root) as s3:
        for url, exp in expected.items():
            # s3root should work as a prefix
            s3obj = s3.get(url[plen:])
            assert s3obj.key == url[plen:]
            assert_results([s3obj], {url: exp})
        with pytest.raises(MetaflowS3URLException):
            s3.get("s3://some/fake/address")

    # option 2) full url as s3root
    for url, exp in expected.items():
        with S3(s3root=url) as s3:
            s3obj = s3.get()
            assert_results([s3obj], {url: exp})

    # option 3) full urls
    with S3() as s3:
        for url, exp in expected.items():
            # full urls should work as-is
            s3obj = s3.get(url)
            assert s3obj.key == url
            assert_results([s3obj], {url: exp})
        with pytest.raises(MetaflowS3URLException):
            s3.get("suffix")
        with pytest.raises(MetaflowS3URLException):
            s3.get("s3://nopath")
        with pytest.raises(MetaflowS3URLException):
            s3.get_many(["suffixes"])
        with pytest.raises(MetaflowS3URLException):
            s3.get_recursive(["suffixes"])
        with pytest.raises(MetaflowS3URLException):
            s3.get_all()

    # option 4) 'current' environment (fake a running flow)
    flow = FakeFlow(use_cli=False)
    parsed = urlparse(s3root)
    with pytest.raises(MetaflowS3URLException):
        # current not set yet, so this should fail
        with S3(run=flow):
            pass
    current._set_env(
        FakeFlow(name=flow_name),
        run_id,
        "no_step",
        "no_task",
        "no_origin_run_id",
        "no_ns",
        "no_user",
    )
    with S3(bucket=parsed.netloc, prefix=parsed.path, run=flow) as s3:
        for url, exp in expected.items():
            name = url.split("/")[-1]
            s3obj = s3.get(name)
            assert s3obj.key == name
            assert_results([s3obj], {url: exp})
        names = [url.split("/")[-1] for url in expected]
        s3objs = s3.get_many(names)
        assert {e.key for e in s3objs} == set(names)
        assert_results(s3objs, expected)
        assert_results(s3.get_all(), expected, info_should_be_empty=True)

    # option 5) run object
    if DO_TEST_RUN:
        # Only works if a metadata service exists with the run in question.
        namespace(None)
        with S3(bucket=parsed.netloc, prefix=parsed.path, run=Run(pathspec)) as s3:
            names = [url.split("/")[-1] for url in expected]
            assert_results(s3.get_many(names), expected)
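# `FakeFlow` is a test double defined elsewhere. From the two call sites
# above it must accept `use_cli` and `name` keyword arguments and expose a
# `name` attribute for S3(run=flow). A minimal hypothetical stand-in:
class _FakeFlowSketch(object):
    def __init__(self, name="FakeFlow", use_cli=True):
        # S3(run=...) only needs the flow's name once `current` is set;
        # `use_cli` mirrors FlowSpec's constructor. Both kwargs are guesses
        # based on usage in test_init_options.
        self.name = name
        self.use_cli = use_cli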
def _do():
    with S3() as s3:
        # Use return_missing as this is the most expensive path
        res = s3.get_many(urls, return_missing=True)
        return res
def _do():
    with S3() as s3:
        res = []
        for url in expected:
            res.append(s3.info(url))
        return res
def _do():
    with S3() as s3:
        res = s3.info_many(urls)
        return res
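# The bare `_do` closures above read like benchmark bodies rather than
# standalone tests. If this suite uses pytest-benchmark (an assumption), the
# surrounding harness would resemble the following hypothetical wrapper:
def _test_info_many_benchmark_sketch(benchmark, s3root, prefixes, expected):
    urls = list(expected)

    def _do():
        with S3() as s3:
            res = s3.info_many(urls)
            return res

    # pytest-benchmark's `benchmark` fixture calls _do repeatedly and
    # records timing statistics; its return value is _do's result.
    res = benchmark(_do)
    assert_results(res, expected, info_only=True)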
def test_list_paths(s3root, prefixes, expected):
    def urls_by_prefix(prefix):
        root = os.path.join(s3root, prefix)
        for url, v in expected.items():
            if url.startswith(root) and v[None].size is not None:
                yield url

    # 1) test that list_paths() without arguments works
    matches = {prefix: frozenset(urls_by_prefix(prefix)) for prefix in prefixes}
    non_empty = {prefix for prefix, urls in matches.items() if urls}
    with S3(s3root=s3root) as s3:
        s3objs = s3.list_paths()
        # found_prefixes is a subset of paths under s3root
        found_prefixes = [e for e in s3objs if e.key in prefixes]
        # we expect to find all non-empty prefixes under the s3root
        assert {e.key for e in found_prefixes} == non_empty
        # they should be all marked as non-existent objects, just prefixes
        assert all(not e.exists for e in found_prefixes)
        # they should be all marked as not downloaded
        assert all(not e.downloaded for e in found_prefixes)

    # 2) test querying by many prefixes
    with S3(s3root=s3root) as s3:
        s3objs = s3.list_paths(prefixes)
        assert (
            frozenset(e.prefix.rstrip("/").split("/")[-1] for e in s3objs) == non_empty
        )
        for prefix, exp in matches.items():
            exists = frozenset(
                e.url for e in s3objs if e.prefix == prefix and e.exists
            )
            not_exists = frozenset(
                e.url for e in s3objs if e.prefix == prefix and not e.exists
            )
            # every object should be expected
            assert all(e in exp for e in exists)
            # not existing ones are prefixes, they shouldn't match
            assert all(e not in exp for e in not_exists)

    # 3) eventually list_paths should hit the leaf
    for url, v in expected.items():
        if v[None].size is None:
            with S3() as s3:
                # querying a non-existent object should return
                # prefixes or nothing
                s3objs = s3.list_paths([url])
                assert [e for e in s3objs if e.exists] == []
        else:
            suffix = url[len(s3root):]
            expected_keys = suffix.split("/")
            if len(expected_keys) > 20:
                # speed optimization: exclude crazy long paths
                continue
            got_url = s3root
            for idx, expected_key in enumerate(expected_keys):
                with S3(s3root=got_url) as s3:
                    s3objs = s3.list_paths()
                    # are we at the leaf?
                    if idx == len(expected_keys) - 1:
                        # a leaf object should always exist
                        [match] = [
                            e for e in s3objs if e.key == expected_key and e.exists
                        ]
                    else:
                        # a non-leaf may match objects that are also prefixes
                        [match] = [
                            e
                            for e in s3objs
                            if e.key == expected_key and not e.exists
                        ]
                    # prefix + key == url
                    assert os.path.join(match.prefix, match.key) == match.url.rstrip("/")
                    got_url = match.url
            # the leaf should be the object itself
            assert match.url == url
def _do():
    new_files = [(str(uuid4()), path) for _, path in all_files]
    with S3(s3root=s3root) as s3:
        s3urls = s3.put_files(new_files, overwrite=False)
        return s3urls