def main():
    if len(sys.argv) == 1 or (len(sys.argv) > 1 and sys.argv[1] in ("-h", "--help")):
        printUsage()

    rootid = sys.argv[1]

    if not isValidUuid(rootid):
        print("Invalid root id!")
        sys.exit(1)

    if not isSchema2Id(rootid):
        print("This tool can only be used with Schema v2 ids")
        sys.exit(1)

    # we need to set up an asyncio loop to query s3
    loop = asyncio.get_event_loop()
    app = {}
    app["bucket_name"] = config.get("bucket_name")
    app["loop"] = loop
    session = get_session(loop=loop)
    app["session"] = session

    loop.run_until_complete(run_delete(app, rootid))

    loop.close()
    print("done!")
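# A minimal sketch (not part of the original tool) of the same flow using
# asyncio.run(), which replaces the explicit get_event_loop()/run_until_complete
# pattern on newer Python versions. It assumes config, get_session(), and
# run_delete() behave as in main() above; get_session() is called without the
# loop argument, which newer session factories typically no longer accept.
async def _delete_root(rootid):
    app = {}
    app["bucket_name"] = config.get("bucket_name")
    app["session"] = get_session()
    await run_delete(app, rootid)

# usage sketch: asyncio.run(_delete_root(rootid))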
def main():
    do_update = False
    if len(sys.argv) < 4:
        printUsage()

    rootid = sys.argv[1]
    prefix_old = sys.argv[2]
    prefix_new = sys.argv[3]
    if len(sys.argv) > 4 and sys.argv[4] == "-update":
        do_update = True

    if not isValidUuid(rootid):
        print("Invalid root id!")
        sys.exit(1)

    if not isSchema2Id(rootid):
        print("This tool can only be used with Schema v2 ids")
        sys.exit(1)

    if prefix_old == prefix_new:
        print("prefix_old and prefix_new are the same")
        sys.exit(1)

    # we need to set up an asyncio loop to query s3
    loop = asyncio.get_event_loop()
    app = {}
    app["bucket_name"] = config.get("bucket_name")
    app["prefix_old"] = prefix_old
    app["prefix_new"] = prefix_new
    app["do_update"] = do_update
    app["dataset_count"] = 0
    app["matched_dset_uri"] = 0
    app["indirect_dataset_keys"] = []
    app["loop"] = loop
    session = get_session()
    app["session"] = session
    app["filter_map"] = {}

    # need the metadata cache since we will be calling into some SN methods
    metadata_mem_cache_size = int(config.get("metadata_mem_cache_size"))
    app['meta_cache'] = LruCache(mem_target=metadata_mem_cache_size, name="MetaCache")

    loop.run_until_complete(run_scan(app, rootid=rootid, update=do_update))

    loop.close()

    print("datasets scanned:", app["dataset_count"])
    print(
        "datasets with matching uri ('H5D_CONTIGUOUS_REF', 'H5D_CHUNKED_REF' layouts):",
        app["matched_dset_uri"],
    )
    print("done!")
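# Example invocation (the script name below is hypothetical; the argument order
# follows the sys.argv handling above):
#
#     python update_dset_uris.py <rootid> <prefix_old> <prefix_new> [-update]
#
# Without -update the run presumably only scans and reports datasets whose
# H5D_CONTIGUOUS_REF / H5D_CHUNKED_REF layouts reference prefix_old; with
# -update it applies the prefix substitution.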
def main():
    if len(sys.argv) == 1 or (len(sys.argv) > 1 and sys.argv[1] in ("-h", "--help")):
        printUsage()

    rootid = sys.argv[1]
    if len(sys.argv) > 2 and sys.argv[2] == "-update":
        do_update = True
    else:
        do_update = False

    if not isValidUuid(rootid):
        print("Invalid root id!")
        sys.exit(1)

    if not isSchema2Id(rootid):
        print("This tool can only be used with Schema v2 ids")
        sys.exit(1)

    # we need to set up an asyncio loop to query s3
    loop = asyncio.get_event_loop()
    app = {}
    app["bucket_name"] = config.get("bucket_name")
    app["loop"] = loop
    session = get_session(loop=loop)
    app["session"] = session

    loop.run_until_complete(run_scan(app, rootid=rootid, update=do_update))
    loop.close()

    results = app["scanRoot_results"]
    datasets = results["datasets"]
    lastModified = datetime.fromtimestamp(results["lastModified"])
    total_size = results["metadata_bytes"] + results["allocated_bytes"]
    print(f"lastModified: {lastModified}")
    print(f"size: {total_size}")
    print(f"num chunks: {results['num_chunks']}")
    print(f"num_groups: {results['num_groups']}")
    print(f"num_datatypes: {results['num_datatypes']}")
    print(f"num_datasets: {len(datasets)}")
    for dsetid in datasets:
        dataset_info = datasets[dsetid]
        print(f" {dsetid}: {dataset_info['lastModified']}, {dataset_info['num_chunks']}, {dataset_info['allocated_bytes']}")
    scan_start = datetime.fromtimestamp(results["scan_start"])
    print(f"scan_start: {scan_start}")
    scan_complete = datetime.fromtimestamp(results["scan_complete"])
    print(f"scan_complete: {scan_complete}")
    print("done!")
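# Entry-point guard (a sketch; the original module may already define one):
if __name__ == "__main__":
    main()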
def testIsValidUuid(self):
    group_id = "g-314d61b8-9954-11e6-a733-3c15c2da029e"
    dataset_id = "d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e"
    ctype_id = "t-8c785f1c-9953-11e6-9bc2-0242ac110005"
    chunk_id = "c-8c785f1c-9953-11e6-9bc2-0242ac110005_7_2"
    domain_id = "mybucket/bob/mydata.h5"
    valid_ids = (group_id, dataset_id, ctype_id, chunk_id, domain_id)
    bad_ids = ("g-1e76d862", "/bob/mydata.h5")

    self.assertTrue(isValidUuid(group_id))
    self.assertFalse(isSchema2Id(group_id))
    self.assertTrue(isValidUuid(group_id, obj_class="Group"))
    self.assertTrue(isValidUuid(group_id, obj_class="group"))
    self.assertTrue(isValidUuid(group_id, obj_class="groups"))
    self.assertTrue(isValidUuid(dataset_id, obj_class="datasets"))
    self.assertFalse(isSchema2Id(dataset_id))
    self.assertTrue(isValidUuid(ctype_id, obj_class="datatypes"))
    self.assertFalse(isSchema2Id(ctype_id))
    self.assertTrue(isValidUuid(chunk_id, obj_class="chunks"))
    self.assertFalse(isSchema2Id(chunk_id))
    validateUuid(group_id)
    try:
        isRootObjId(group_id)
        self.assertTrue(False)
    except ValueError:
        # only works for v2 schema
        pass  # expected

    for item in valid_ids:
        self.assertTrue(isObjId(item))
        s3key = getS3Key(item)
        self.assertTrue(s3key[0] != '/')
        self.assertTrue(isS3ObjKey(s3key))
        if item.find('/') > 0:
            continue  # bucket name gets lost when domain ids get converted to s3keys
        objid = getObjId(s3key)
        self.assertEqual(objid, item)

    for item in bad_ids:
        self.assertFalse(isValidUuid(item))
        self.assertFalse(isObjId(item))
def testSchema2Id(self):
    root_id = createObjId("roots")
    group_id = createObjId("groups", rootid=root_id)
    dataset_id = createObjId("datasets", rootid=root_id)
    ctype_id = createObjId("datatypes", rootid=root_id)

    self.assertEqual(getCollectionForId(root_id), "groups")
    self.assertEqual(getCollectionForId(group_id), "groups")
    self.assertEqual(getCollectionForId(dataset_id), "datasets")
    self.assertEqual(getCollectionForId(ctype_id), "datatypes")
    chunk_id = 'c' + dataset_id[1:] + "_1_2"
    print(chunk_id)
    chunk_partition_id = 'c42-' + dataset_id[2:] + "_1_2"

    for id in (chunk_id, chunk_partition_id):
        try:
            getCollectionForId(id)
            self.assertTrue(False)
        except ValueError:
            pass  # expected

    valid_ids = (group_id, dataset_id, ctype_id, chunk_id, chunk_partition_id, root_id)
    s3prefix = getS3Key(root_id)
    self.assertTrue(s3prefix.endswith("/.group.json"))
    s3prefix = s3prefix[:-(len(".group.json"))]
    for oid in valid_ids:
        print("oid:", oid)
        self.assertTrue(len(oid) >= 38)
        parts = oid.split('-')
        self.assertEqual(len(parts), 6)
        self.assertTrue(oid[0] in ('g', 'd', 't', 'c'))
        self.assertTrue(isSchema2Id(oid))
        if oid == root_id:
            self.assertTrue(isRootObjId(oid))
        else:
            self.assertFalse(isRootObjId(oid))
        self.assertEqual(getRootObjId(oid), root_id)
        s3key = getS3Key(oid)
        print(s3key)
        self.assertTrue(s3key.startswith(s3prefix))
        self.assertEqual(getObjId(s3key), oid)
        self.assertTrue(isS3ObjKey(s3key))
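# A minimal round-trip sketch of what the assertions above exercise. The import
# path is an assumption about where these helpers live in this repo:
#
#     from hsds.util.idUtil import createObjId, getS3Key, getObjId
#
#     root_id = createObjId("roots")                      # schema v2 root id
#     dset_id = createObjId("datasets", rootid=root_id)   # object id tied to that root
#     s3key = getS3Key(dset_id)                           # key under the root's S3 prefix
#     assert getObjId(s3key) == dset_id                   # ids and S3 keys round-trip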
def main():
    if len(sys.argv) == 1 or (len(sys.argv) > 1 and sys.argv[1] in ("-h", "--help")):
        printUsage()

    rootid = sys.argv[1]
    if len(sys.argv) > 2 and sys.argv[2] == "-update":
        do_update = True
    else:
        do_update = False

    if not isValidUuid(rootid):
        print("Invalid root id!")
        sys.exit(1)

    if not isSchema2Id(rootid):
        print("This tool can only be used with Schema v2 ids")
        sys.exit(1)

    # we need to set up an asyncio loop to query s3
    loop = asyncio.get_event_loop()
    app = {}
    app["bucket_name"] = config.get("bucket_name")
    app["loop"] = loop
    session = get_session()
    app["session"] = session
    app["filter_map"] = {}

    # need the metadata cache since we will be calling into some SN methods
    metadata_mem_cache_size = int(config.get("metadata_mem_cache_size"))
    app['meta_cache'] = LruCache(mem_target=metadata_mem_cache_size, name="MetaCache")

    loop.run_until_complete(run_scan(app, rootid=rootid, update=do_update))
    loop.close()

    results = app["scanRoot_results"]
    datasets = results["datasets"]
    lastModified = datetime.fromtimestamp(results["lastModified"])
    print(f"lastModified: {lastModified}")
    if "md5_sum" in results:
        checksum = results["md5_sum"]
        print(f"md5_sum: {checksum}")
    print(f"metadata bytes: {results['metadata_bytes']}")
    print(f"allocated bytes: {results['allocated_bytes']}")
    print(f"logical bytes: {results['logical_bytes']}")
    print(f"num chunks: {results['num_chunks']}")
    print(f"linked chunks: {results['num_linked_chunks']}")
    print(f"linked bytes: {results['linked_bytes']}")
    print(f"num_groups: {results['num_groups']}")
    print(f"num_datatypes: {results['num_datatypes']}")
    print(f"num_datasets: {len(datasets)}")
    if datasets:
        print(
            " dataset_id\tlast_modified\tnum_chunks\tallocated_bytes\tlogical_bytes\tlinked_bytes\tnum_link_chunks"
        )
    for dsetid in datasets:
        dataset_info = datasets[dsetid]
        lm = dataset_info['lastModified']
        nc = dataset_info['num_chunks']
        ab = dataset_info['allocated_bytes']
        lb = dataset_info['logical_bytes']
        ln = dataset_info['linked_bytes']
        nl = dataset_info['num_linked_chunks']
        print(f" {dsetid}: {lm}, {nc}, {ab}, {lb}, {ln}, {nl}")
    scan_start = datetime.fromtimestamp(results["scan_start"])
    print(f"scan_start: {scan_start}")
    scan_complete = datetime.fromtimestamp(results["scan_complete"])
    print(f"scan_complete: {scan_complete}")
    print("done!")
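# Note (a sketch, not part of the original tool): datetime.fromtimestamp() above
# renders the scan timestamps in the local timezone. If UTC output is preferred,
# the stdlib supports passing an explicit timezone:
#
#     from datetime import datetime, timezone
#     scan_start_utc = datetime.fromtimestamp(results["scan_start"], tz=timezone.utc)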