def main():
    """Command-line entry point: print the values of a chunk stored in S3.

    Usage: <script> [-h|--help] <chunk_id>
    Exits non-zero on bad arguments or a missing 'domain' config setting.
    """
    if len(sys.argv) == 1 or sys.argv[1] in ("-h", "--help"):
        printUsage()
        sys.exit(1)

    chunk_id = sys.argv[-1]
    if not isValidChunkId(chunk_id):
        print("Invalid chunk id")
        sys.exit(1)

    # we need to setup an asyncio loop to query s3
    loop = asyncio.get_event_loop()
    session = get_session(loop=loop)

    # minimal app context expected by the helper coroutines
    app = {}
    app["session"] = session
    app["bucket_name"] = config.get("bucket_name")
    app["node_count"] = 1
    app["node_number"] = 0
    app["deleted_ids"] = set()
    app["pending_s3_read"] = {}
    # BUG FIX: the original assigned app['meta_cache'] = {} and then
    # immediately overwrote it with an LruCache; the dead dict assignment
    # has been removed.
    app["meta_cache"] = LruCache(mem_target=1024 * 1024, chunk_cache=False)
    app["chunk_cache"] = LruCache(mem_target=64 * 1024 * 1024, chunk_cache=True)

    domain = config.get("domain")
    if not domain:
        printUsage()
        sys.exit(1)  # was sys.exit(-1); normalized to 1 for consistency

    print("got domain:", domain)

    loop.run_until_complete(printChunkValues(app, domain, chunk_id))
    loop.close()
def testClearCache(self):
    """Verify that clearCache() empties the LRU cache completely."""
    # target is large enough that no eviction should occur
    cache = LruCache(mem_target=1024 * 1024 * 1024)
    self.assertEqual(len(cache), 0)

    chunk_ids = []
    # populate the cache with ten 1 KiB chunks
    for value in range(10):
        chunk_id = createObjId("chunks")
        chunk_ids.append(chunk_id)
        data = np.empty((16, 16), dtype='i4')  # 1024 bytes
        data[...] = value
        cache[chunk_id] = data

    for key in cache:
        self.assertTrue(key.startswith("c-"))
        self.assertTrue(key in chunk_ids)
    self.assertEqual(len(cache), 10)
    # most recently added chunk sits at the head, oldest at the tail
    self.assertEqual(cache._lru_head._id, chunk_ids[-1])
    self.assertEqual(cache._lru_tail._id, chunk_ids[0])
    self.assertEqual(cache.dirtyCount, 0)
    cache.consistencyCheck()

    # clearing should drop every entry and leave a consistent state
    cache.clearCache()
    self.assertEqual(len(cache), 0)
    cache.consistencyCheck()
def main():
    """Scan a schema-v2 root for datasets whose chunk layout uri references
    prefix_old, optionally rewriting the uri to prefix_new.

    Usage: <script> <rootid> <prefix_old> <prefix_new> [-update]
    """
    do_update = False
    if len(sys.argv) < 4:
        printUsage()
        # BUG FIX: originally fell through to sys.argv[1] after printUsage(),
        # which would raise IndexError if printUsage does not itself exit.
        sys.exit(1)
    rootid = sys.argv[1]
    prefix_old = sys.argv[2]
    prefix_new = sys.argv[3]
    if len(sys.argv) > 4 and sys.argv[4] == "-update":
        do_update = True

    if not isValidUuid(rootid):
        print("Invalid root id!")
        sys.exit(1)
    if not isSchema2Id(rootid):
        print("This tool can only be used with Schema v2 ids")
        sys.exit(1)
    if prefix_old == prefix_new:
        # typo fix: message previously read "or the same"
        print("prefix_old and prefix_new are the same")
        sys.exit(1)

    # we need to setup an asyncio loop to query s3
    loop = asyncio.get_event_loop()

    app = {}
    app["bucket_name"] = config.get("bucket_name")
    app["prefix_old"] = prefix_old
    app["prefix_new"] = prefix_new
    app["do_update"] = do_update
    app["dataset_count"] = 0
    app["matched_dset_uri"] = 0
    app["indirect_dataset_keys"] = []
    app["loop"] = loop
    session = get_session()
    app["session"] = session
    app["filter_map"] = {}
    # need the metadata cache since we will be calling into some SN methods
    metadata_mem_cache_size = int(config.get("metadata_mem_cache_size"))
    app['meta_cache'] = LruCache(mem_target=metadata_mem_cache_size, name="MetaCache")

    loop.run_until_complete(run_scan(app, rootid=rootid, update=do_update))
    loop.close()

    # typo fix: previously printed "datsets scanned:"
    print("datasets scanned:", app["dataset_count"])
    print(
        "datasets with matching uri ('H5D_CONTIGUOUS_REF', 'H5D_CHUNKED_REF' layouts):",
        app["matched_dset_uri"])
    print("done!")
def main():
    """Scan a schema-v2 root id and print the collected storage statistics.

    Usage: <script> <rootid> [-update]
    """
    # parenthesize what was an implicit or/and precedence chain for clarity
    help_requested = len(sys.argv) > 1 and sys.argv[1] in ("-h", "--help")
    if len(sys.argv) == 1 or help_requested:
        printUsage()
        # BUG FIX: originally fell through to rootid = sys.argv[1], which
        # raises IndexError when the script is invoked with no arguments.
        sys.exit(1)
    rootid = sys.argv[1]
    do_update = len(sys.argv) > 2 and sys.argv[2] == "-update"

    if not isValidUuid(rootid):
        print("Invalid root id!")
        sys.exit(1)
    if not isSchema2Id(rootid):
        print("This tool can only be used with Schema v2 ids")
        sys.exit(1)

    # we need to setup an asyncio loop to query s3
    loop = asyncio.get_event_loop()
    app = {}
    app["bucket_name"] = config.get("bucket_name")
    app["loop"] = loop
    session = get_session()
    app["session"] = session
    app["filter_map"] = {}
    # need the metadata cache since we will be calling into some SN methods
    metadata_mem_cache_size = int(config.get("metadata_mem_cache_size"))
    app['meta_cache'] = LruCache(mem_target=metadata_mem_cache_size, name="MetaCache")

    loop.run_until_complete(run_scan(app, rootid=rootid, update=do_update))
    loop.close()

    # report the scan results collected by run_scan
    results = app["scanRoot_results"]
    datasets = results["datasets"]
    lastModified = datetime.fromtimestamp(results["lastModified"])
    print(f"lastModified: {lastModified}")
    if "md5_sum" in results:
        checksum = results["md5_sum"]
        print(f"md5_sum: {checksum}")
    print(f"metadata bytes: {results['metadata_bytes']}")
    print(f"allocated bytes: {results['allocated_bytes']}")
    print(f"logical bytes: {results['logical_bytes']}")
    print(f"num chunks: {results['num_chunks']}")
    print(f"linked chunks: {results['num_linked_chunks']}")
    print(f"linked bytes: {results['linked_bytes']}")
    print(f"num_groups: {results['num_groups']}")
    print(f"num_datatypes: {results['num_datatypes']}")
    print(f"num_datasets: {len(datasets)}")
    if datasets:
        print(
            " dataset_id\tlast_modified\tnum_chunks\tallocated_bytes\tlogical_bytes\tlinked_bytes\tnum_link_chunks"
        )
        for dsetid in datasets:
            dataset_info = datasets[dsetid]
            lm = dataset_info['lastModified']
            nc = dataset_info['num_chunks']
            ab = dataset_info['allocated_bytes']
            lb = dataset_info['logical_bytes']
            ln = dataset_info['linked_bytes']
            nl = dataset_info['num_linked_chunks']
            print(f" {dsetid}: {lm}, {nc}, {ab}, {lb}, {ln}, {nl}")
    scan_start = datetime.fromtimestamp(results["scan_start"])
    print(f"scan_start: {scan_start}")
    scan_complete = datetime.fromtimestamp(results["scan_complete"])
    print(f"scan_complete: {scan_complete}")
    print("done!")
def testSimple(self):
    """Exercise the basic LruCache operations with a single cached chunk."""
    cache = LruCache(mem_target=1000 * 1000 * 10)
    cache.consistencyCheck()
    self.assertEqual(len(cache), 0)
    self.assertEqual(cache.dump_lru(), "->\n<-\n")
    self.assertFalse("xyz" in cache)

    # only numpy arrays may be stored under a chunk id
    obj_id = createObjId("chunks")
    with self.assertRaises(TypeError):
        cache[obj_id] = list(range(20))

    # a non-chunk id is rejected even when the value is an array
    block = np.empty((16, 16), dtype='i4')
    obj_id = createObjId("datasets")
    with self.assertRaises(ValueError):
        cache[obj_id] = block

    chunk_id = createObjId("chunks")
    payload = np.random.random((500, 500))  # smaller than our chunk cache size
    cache[chunk_id] = payload  # add to cache
    cache.consistencyCheck()
    self.assertEqual(len(cache), 1)
    self.assertTrue(chunk_id in cache)
    expected_lru = "->" + chunk_id + "\n<-" + chunk_id + "\n"
    self.assertEqual(cache.memTarget, 1000 * 1000 * 10)
    self.assertEqual(cache.memUsed, 500 * 500 * 8)
    self.assertEqual(cache.memDirty, 0)
    self.assertEqual(cache.cacheUtilizationPercent, 20)  # 20% of target memory

    # re-adding the same id should not change the entry count
    cache[chunk_id] = payload
    cache.consistencyCheck()
    self.assertEqual(len(cache), 1)
    self.assertTrue(chunk_id in cache)

    # exercise the dirty-flag round trip
    self.assertFalse(cache.isDirty(chunk_id))
    self.assertEqual(cache.dirtyCount, 0)
    cache.setDirty(chunk_id)
    cache.consistencyCheck()
    self.assertTrue(cache.isDirty(chunk_id))
    self.assertEqual(cache.dirtyCount, 1)
    self.assertEqual(cache.dump_lru(), expected_lru)
    cache.consistencyCheck()
    cache.clearDirty(chunk_id)
    cache.consistencyCheck()
    self.assertFalse(cache.isDirty(chunk_id))
    self.assertEqual(cache.dirtyCount, 0)
    # chunk should not have been evicted from cache
    self.assertEqual(len(cache), 1)
    self.assertTrue(chunk_id in cache)

    # delete from cache and confirm it is empty again
    del cache[chunk_id]
    cache.consistencyCheck()
    self.assertEqual(len(cache), 0)
    self.assertFalse(chunk_id in cache)
    self.assertEqual(cache.memTarget, 1000 * 1000 * 10)
    self.assertEqual(cache.memUsed, 0)
    self.assertEqual(cache.memDirty, 0)
    self.assertEqual(cache.cacheUtilizationPercent, 0)  # no memory used
def testMetaDataCache(self):
    """Exercise LruCache configured as a metadata (dict-valued) cache."""
    cache = LruCache(mem_target=1024 * 10, chunk_cache=False)
    cache.consistencyCheck()
    self.assertEqual(len(cache), 0)
    self.assertEqual(cache.dump_lru(), "->\n<-\n")

    # numpy arrays are rejected - a metadata cache stores dicts only
    obj_id = createObjId("datasets")
    with self.assertRaises(TypeError):
        cache[obj_id] = np.zeros((3, 4))

    data = {"x": 123, "y": 456}
    block = np.zeros((10, ))
    obj_id = createObjId("chunks")
    with self.assertRaises(TypeError):
        cache[obj_id] = block  # not a dict

    group_id = createObjId("groups")
    data = {"foo": "bar"}
    cache[group_id] = data  # add to cache
    cache.consistencyCheck()
    self.assertEqual(len(cache), 1)
    self.assertTrue(group_id in cache)
    expected_lru = "->" + group_id + "\n<-" + group_id + "\n"
    self.assertEqual(cache.memTarget, 1024 * 10)
    self.assertEqual(cache.memUsed, 1024)  # not based on actual size
    self.assertEqual(cache.cacheUtilizationPercent, 10)  # 10% of target memory

    # dirty-flag round trip
    self.assertFalse(cache.isDirty(group_id))
    self.assertEqual(cache.dirtyCount, 0)
    cache.setDirty(group_id)
    cache.consistencyCheck()
    self.assertTrue(cache.isDirty(group_id))
    self.assertEqual(cache.dirtyCount, 1)
    self.assertEqual(cache.dump_lru(), expected_lru)
    cache.clearDirty(group_id)
    cache.consistencyCheck()
    self.assertFalse(cache.isDirty(group_id))
    self.assertEqual(cache.dirtyCount, 0)
    # entry should not have been evicted from cache
    self.assertEqual(len(cache), 1)
    self.assertTrue(group_id in cache)

    # delete from cache
    del cache[group_id]
    cache.consistencyCheck()
    self.assertEqual(len(cache), 0)
    self.assertFalse(group_id in cache)
    self.assertEqual(cache.memTarget, 1024 * 10)
    self.assertEqual(cache.memUsed, 0)
    self.assertEqual(cache.cacheUtilizationPercent, 0)  # no memory used
def testMemUtil(self):
    """Verify that memory accounting tracks the configured target."""
    cache = LruCache(mem_target=5000)
    self.assertEqual(len(cache), 0)

    seen_ids = set()
    # insert ten 1 KiB chunks; the 5000-byte target cannot hold all of them
    for value in range(10):
        chunk_id = createObjId("chunks")
        seen_ids.add(chunk_id)
        data = np.empty((16, 16), dtype='i4')  # 1024 bytes
        data[...] = value
        cache[chunk_id] = data
        self.assertTrue(chunk_id in cache)
        cache.consistencyCheck()

    # given mem-target, some items should have been removed
    self.assertTrue(len(cache) < 10)
    self.assertTrue(cache.cacheUtilizationPercent < 100)
    self.assertEqual(cache.memDirty, 0)

    # add 10 more chunks, setting the dirty flag each time
    for value in range(10):
        chunk_id = createObjId("chunks")
        seen_ids.add(chunk_id)
        data = np.empty((16, 16), dtype='i4')  # 1024 bytes
        data[...] = value
        cache[chunk_id] = data
        self.assertTrue(chunk_id in cache)
        cache.setDirty(chunk_id)
        cache.consistencyCheck()
        self.assertEqual(cache.memDirty, 1024 * (value + 1))

    # dirty chunks can't be released, so utilization exceeds 100%
    self.assertTrue(cache.cacheUtilizationPercent > 100)

    # clear dirty flags (allowing memory to be released)
    cached_ids = [key for key in cache]
    random.shuffle(cached_ids)  # randomize the order we clear dirty flags
    cache.clearDirty(cached_ids[0])
    cache.consistencyCheck()
    for chunk_id in cached_ids:
        self.assertTrue(chunk_id in seen_ids)
        dirty_before = cache.memDirty
        if cache.isDirty(chunk_id):
            cache.clearDirty(chunk_id)
            self.assertTrue(cache.memDirty < dirty_before)

    # mem percent should be no more than 100 now
    self.assertTrue(cache.cacheUtilizationPercent <= 100)
def testLRU(self):
    """Check LRU ordering and replacement logic."""
    # big enough that there shouldn't be any cleanup
    cache = LruCache(mem_target=1024 * 1024 * 1024)
    self.assertEqual(len(cache), 0)

    chunk_ids = []
    # add ten 1 KiB chunks to the cache
    for value in range(10):
        chunk_id = createObjId("chunks")
        chunk_ids.append(chunk_id)
        data = np.empty((16, 16), dtype='i4')  # 1024 bytes
        data[...] = value
        cache[chunk_id] = data

    for key in cache:
        self.assertTrue(key.startswith("c-"))
        self.assertTrue(key in chunk_ids)
    self.assertEqual(len(cache), 10)
    # newest insert is at the head, oldest at the tail
    self.assertEqual(cache._lru_head._id, chunk_ids[-1])
    self.assertEqual(cache._lru_tail._id, chunk_ids[0])
    self.assertEqual(cache.dirtyCount, 0)
    cache.consistencyCheck()

    # walk the linked list head-to-tail: should be reverse insert order
    node = cache._lru_head
    for offset in range(10):
        self.assertEqual(node._id, chunk_ids[9 - offset])
        node = node._next
    self.assertTrue(node is None)

    chunk_5 = chunk_ids[5]
    cache.consistencyCheck()
    cached_arr = cache[chunk_5]
    self.assertEqual(cached_arr[0, 0], 5)
    # the get should have moved this entry to the front
    self.assertEqual(cache._lru_head._id, chunk_5)

    for index in range(10):
        self.assertFalse(cache.isDirty(chunk_ids[index]))
    # isDirty queries shouldn't have affected the position
    self.assertEqual(cache._lru_head._id, chunk_5)

    # set chunk 7 to dirty, then clear it again
    chunk_7 = chunk_ids[7]
    cache.consistencyCheck()
    cache.setDirty(chunk_7)
    cache.consistencyCheck()
    self.assertEqual(cache.dirtyCount, 1)
    cache.clearDirty(chunk_7)
    self.assertEqual(cache.dirtyCount, 0)

    # remove every chunk in a random order
    random.shuffle(chunk_ids)
    for index in range(10):
        del cache[chunk_ids[index]]
        cache.consistencyCheck()

    self.assertEqual(len(cache), 0)
    self.assertEqual(cache._lru_head, None)
    self.assertEqual(cache._lru_tail, None)
    cache.consistencyCheck()
def testMetaDataCache(self):
    """Exercise a dict-valued LruCache created with name="ChunkCache".

    NOTE(review): a method with this exact name appears earlier in this
    file; if both live in the same TestCase class, this later definition
    silently shadows the earlier one - confirm and rename if so.
    """
    cache = LruCache(mem_target=1024 * 10, name="ChunkCache")
    cache.consistencyCheck()
    self.assertEqual(len(cache), 0)
    self.assertEqual(cache.dump_lru(), "->\n<-\n")

    data = {"x": 123, "y": 456}
    group_id = createObjId("groups")
    data = {"foo": "bar"}
    cache[group_id] = data  # add to cache
    cache.consistencyCheck()
    self.assertEqual(len(cache), 1)
    self.assertTrue(group_id in cache)
    expected_lru = "->" + group_id + "\n<-" + group_id + "\n"
    self.assertEqual(cache.memTarget, 1024 * 10)
    self.assertEqual(cache.memUsed, 1024)  # not based on actual size
    self.assertEqual(cache.cacheUtilizationPercent, 10)  # 10% of target memory

    # dirty-flag round trip
    self.assertFalse(cache.isDirty(group_id))
    self.assertEqual(cache.dirtyCount, 0)
    cache.setDirty(group_id)
    cache.consistencyCheck()
    self.assertTrue(cache.isDirty(group_id))
    self.assertEqual(cache.dirtyCount, 1)
    self.assertEqual(cache.dump_lru(), expected_lru)
    cache.clearDirty(group_id)
    cache.consistencyCheck()
    self.assertFalse(cache.isDirty(group_id))
    self.assertEqual(cache.dirtyCount, 0)
    # entry should not have been evicted from cache
    self.assertEqual(len(cache), 1)
    self.assertTrue(group_id in cache)

    # delete from cache
    del cache[group_id]
    cache.consistencyCheck()
    self.assertEqual(len(cache), 0)
    self.assertFalse(group_id in cache)
    self.assertEqual(cache.memTarget, 1024 * 10)
    self.assertEqual(cache.memUsed, 0)
    self.assertEqual(cache.cacheUtilizationPercent, 0)  # no memory used