Example 1
def main():
    if len(sys.argv) == 1 or sys.argv[1] == "-h" or sys.argv[1] == "--help":
        printUsage()
        sys.exit(1)

    chunk_id = sys.argv[-1]
    if not isValidChunkId(chunk_id):
        print("Invalid chunk id")
        sys.exit(1)

    # we need to set up an asyncio loop to query S3
    loop = asyncio.get_event_loop()
    session = get_session(loop=loop)

    app = {}
    app["session"] = session
    app['bucket_name'] = config.get("bucket_name")
    app['node_count'] = 1
    app['node_number'] = 0  
    app['deleted_ids'] = set()
    app['pending_s3_read'] = {}
    app['meta_cache'] = LruCache(mem_target=1024*1024, chunk_cache=False)
    app['chunk_cache'] = LruCache(mem_target=64*1024*1024, chunk_cache=True)
    domain = config.get("domain")
    if not domain:
        printUsage()
        sys.exit(-1)
    print("got domain:", domain)

    loop.run_until_complete(printChunkValues(app, domain, chunk_id))

    loop.close()
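A side note on the loop handling above: on Python 3.7+, asyncio.run() creates and closes the event loop itself. A minimal sketch, assuming the app/domain/chunk_id setup from this example and a get_session() variant that does not need the loop argument:

import asyncio

asyncio.run(printChunkValues(app, domain, chunk_id))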
Example 2
    def testClearCache(self):
        """ Check LRU clear logic """
        # mem_target big enough that there shouldn't be any cleanup
        cc = LruCache(mem_target=1024 * 1024 * 1024)
        self.assertEqual(len(cc), 0)
        ids = []
        # add chunks to the cache
        for i in range(10):
            id = createObjId("chunks")
            ids.append(id)
            arr = np.empty((16, 16), dtype='i4')  # 1024 bytes
            arr[...] = i
            cc[id] = arr
        for id in cc:
            self.assertTrue(id.startswith("c-"))
            self.assertTrue(id in ids)
        self.assertEqual(len(cc), 10)
        self.assertEqual(cc._lru_head._id, ids[-1])
        self.assertEqual(cc._lru_tail._id, ids[0])
        self.assertEqual(cc.dirtyCount, 0)
        cc.consistencyCheck()

        cc.clearCache()
        self.assertEqual(len(cc), 0)

        cc.consistencyCheck()
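As a point of reference, the head/tail ordering and clear semantics asserted here can be modeled with collections.OrderedDict; a toy sketch, not the LruCache API:

from collections import OrderedDict

lru = OrderedDict()
for i in range(10):
    lru[f"c-{i}"] = i
    lru.move_to_end(f"c-{i}", last=False)  # newest entry at the head

assert next(iter(lru)) == "c-9"       # head: last id added
assert next(reversed(lru)) == "c-0"   # tail: first id added
lru.clear()                           # clearCache() analogue
assert len(lru) == 0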
Example 3
def main():

    do_update = False

    if len(sys.argv) < 4:
        printUsage()

    rootid = sys.argv[1]
    prefix_old = sys.argv[2]
    prefix_new = sys.argv[3]
    if len(sys.argv) > 4 and sys.argv[4] == "-update":
        do_update = True

    if not isValidUuid(rootid):
        print("Invalid root id!")
        sys.exit(1)

    if not isSchema2Id(rootid):
        print("This tool can only be used with Schema v2 ids")
        sys.exit(1)

    if prefix_old == prefix_new:
        print("prefix_old and prefix_new or the same")
        sys.exit(1)

    # we need to set up an asyncio loop to query S3
    loop = asyncio.get_event_loop()

    app = {}
    app["bucket_name"] = config.get("bucket_name")
    app["prefix_old"] = prefix_old
    app["prefix_new"] = prefix_new
    app["do_update"] = do_update
    app["dataset_count"] = 0
    app["matched_dset_uri"] = 0
    app["indirect_dataset_keys"] = []
    app["loop"] = loop
    session = get_session()
    app["session"] = session
    app["filter_map"] = {}

    # need the metadata cache since we will be calling into some SN methods
    metadata_mem_cache_size = int(config.get("metadata_mem_cache_size"))
    app['meta_cache'] = LruCache(mem_target=metadata_mem_cache_size,
                                 name="MetaCache")

    loop.run_until_complete(run_scan(app, rootid=rootid, update=do_update))

    loop.close()

    print("datsets scanned:", app["dataset_count"])
    print(
        "datasets with matching uri ('H5D_CONTIGUOUS_REF', 'H5D_CHUNKED_REF' layouts):",
        app["matched_dset_uri"])

    print("done!")
Example 4
def main():

    if len(sys.argv) == 1 or sys.argv[1] in ("-h", "--help"):
        printUsage()

    rootid = sys.argv[1]

    if len(sys.argv) > 2 and sys.argv[2] == "-update":
        do_update = True
    else:
        do_update = False

    if not isValidUuid(rootid):
        print("Invalid root id!")
        sys.exit(1)

    if not isSchema2Id(rootid):
        print("This tool can only be used with Schema v2 ids")
        sys.exit(1)

    # we need to set up an asyncio loop to query S3
    loop = asyncio.get_event_loop()

    app = {}
    app["bucket_name"] = config.get("bucket_name")
    app["loop"] = loop
    session = get_session()
    app["session"] = session
    app["filter_map"] = {}

    # need the metadata cache since we will be calling into some SN methods
    metadata_mem_cache_size = int(config.get("metadata_mem_cache_size"))
    app['meta_cache'] = LruCache(mem_target=metadata_mem_cache_size,
                                 name="MetaCache")

    loop.run_until_complete(run_scan(app, rootid=rootid, update=do_update))

    loop.close()

    results = app["scanRoot_results"]
    datasets = results["datasets"]
    lastModified = datetime.fromtimestamp(results["lastModified"])
    print(f"lastModified: {lastModified}")
    if "md5_sum" in results:
        checksum = results["md5_sum"]
        print(f"md5_sum: {checksum}")
    print(f"metadata bytes: {results['metadata_bytes']}")
    print(f"allocated bytes: {results['allocated_bytes']}")
    print(f"logical bytes: {results['logical_bytes']}")
    print(f"num chunks: {results['num_chunks']}")
    print(f"linked chunks: {results['num_linked_chunks']}")
    print(f"linked bytes: {results['linked_bytes']}")
    print(f"num_groups: {results['num_groups']}")
    print(f"num_datatypes: {results['num_datatypes']}")
    print(f"num_datasets: {len(datasets)}")
    if datasets:
        print(
            "    dataset_id\tlast_modified\tnum_chunks\tallocated_bytes\tlogical_bytes\tlinked_bytes\tnum_link_chunks"
        )
    for dsetid, dataset_info in datasets.items():
        lm = dataset_info['lastModified']
        nc = dataset_info['num_chunks']
        ab = dataset_info['allocated_bytes']
        lb = dataset_info['logical_bytes']
        ln = dataset_info['linked_bytes']
        nl = dataset_info['num_linked_chunks']
        print(f"   {dsetid}: {lm}, {nc}, {ab}, {lb}, {ln}, {nl}")

    scan_start = datetime.fromtimestamp(results["scan_start"])
    print(f"scan_start:    {scan_start}")
    scan_complete = datetime.fromtimestamp(results["scan_complete"])
    print(f"scan_complete: {scan_complete}")

    print("done!")
Example 5
    def testSimple(self):
        """ check basic functions by adding one chunk to cache """
        cc = LruCache(mem_target=1000 * 1000 * 10)
        cc.consistencyCheck()

        self.assertEqual(len(cc), 0)
        self.assertEqual(cc.dump_lru(), "->\n<-\n")

        self.assertFalse("xyz" in cc)

        id = createObjId("chunks")
        try:
            # only numpy arrays can be added to a chunk cache
            cc[id] = list(range(20))
            self.assertTrue(False)
        except TypeError:
            pass  # expected
        arr = np.empty((16, 16), dtype='i4')
        id = createObjId("datasets")
        try:
            cc[id] = arr
            self.assertTrue(False)
        except ValueError:
            pass  # expected - not a chunk id

        rand_id = createObjId("chunks")
        # 500x500 float64 array -- smaller than our chunk cache size
        np_arr = np.random.random((500, 500))
        cc[rand_id] = np_arr  # add to cache
        cc.consistencyCheck()
        self.assertEqual(len(cc), 1)
        self.assertTrue(rand_id in cc)
        lru_str = "->" + rand_id + "\n<-" + rand_id + "\n"
        mem_tgt = cc.memTarget
        self.assertEqual(mem_tgt, 1000 * 1000 * 10)
        mem_used = cc.memUsed
        self.assertEqual(mem_used, 500 * 500 * 8)
        mem_dirty = cc.memDirty
        self.assertEqual(mem_dirty, 0)
        mem_per = cc.cacheUtilizationPercent
        self.assertEqual(mem_per, 20)  # have used 20% of target memory

        # try adding the same id to the cache again
        cc[rand_id] = np_arr
        cc.consistencyCheck()
        self.assertEqual(len(cc), 1)
        self.assertTrue(rand_id in cc)

        # try out the dirty flags
        self.assertFalse(cc.isDirty(rand_id))
        self.assertEqual(cc.dirtyCount, 0)
        cc.setDirty(rand_id)
        cc.consistencyCheck()
        self.assertTrue(cc.isDirty(rand_id))
        self.assertEqual(cc.dirtyCount, 1)
        self.assertEqual(cc.dump_lru(), lru_str)
        cc.consistencyCheck()
        cc.clearDirty(rand_id)
        cc.consistencyCheck()
        self.assertFalse(cc.isDirty(rand_id))
        self.assertEqual(cc.dirtyCount, 0)
        # chunk should not have been evicted from cache
        self.assertEqual(len(cc), 1)
        self.assertTrue(rand_id in cc)
        # delete from cache
        del cc[rand_id]
        cc.consistencyCheck()
        # check cache is empty
        self.assertEqual(len(cc), 0)
        self.assertFalse(rand_id in cc)
        mem_tgt = cc.memTarget
        self.assertEqual(mem_tgt, 1000 * 1000 * 10)
        mem_used = cc.memUsed
        self.assertEqual(mem_used, 0)
        mem_dirty = cc.memDirty
        self.assertEqual(mem_dirty, 0)
        mem_per = cc.cacheUtilizationPercent
        self.assertEqual(mem_per, 0)  # no memory used
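A side note on the exception checks in this test: unittest's assertRaises context manager states the same intent more directly than try/except with assertTrue(False). A drop-in sketch for use inside the method, assuming the same LruCache type rules:

with self.assertRaises(TypeError):
    cc[createObjId("chunks")] = list(range(20))  # not a numpy array
with self.assertRaises(ValueError):
    cc[createObjId("datasets")] = np.empty((16, 16), dtype="i4")  # not a chunk id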
Example 6
    def testMetaDataCache(self):
        """ check metadata cache functionality """
        cc = LruCache(mem_target=1024 * 10, chunk_cache=False)
        cc.consistencyCheck()

        self.assertEqual(len(cc), 0)
        self.assertEqual(cc.dump_lru(), "->\n<-\n")

        id = createObjId("datasets")
        try:
            # only dict objects can be added to a metadata cache
            cc[id] = np.zeros((3, 4))
            self.assertTrue(False)
        except TypeError:
            pass  # expected
        data = {"x": 123, "y": 456}
        arr = np.zeros((10, ))
        id = createObjId("chunks")
        try:
            cc[id] = arr
            self.assertTrue(False)
        except TypeError:
            pass  # expected - not a dict

        rand_id = createObjId("groups")
        data = {"foo": "bar"}
        cc[rand_id] = data  # add to cache
        cc.consistencyCheck()
        self.assertEqual(len(cc), 1)
        self.assertTrue(rand_id in cc)
        lru_str = "->" + rand_id + "\n<-" + rand_id + "\n"
        mem_tgt = cc.memTarget
        self.assertEqual(mem_tgt, 1024 * 10)
        mem_used = cc.memUsed
        self.assertEqual(mem_used, 1024)  # not based on actual size
        mem_per = cc.cacheUtilizationPercent
        self.assertEqual(mem_per, 10)  # have used 10% of target memory
        # try out the dirty flags
        self.assertFalse(cc.isDirty(rand_id))
        self.assertEqual(cc.dirtyCount, 0)
        cc.setDirty(rand_id)
        cc.consistencyCheck()
        self.assertTrue(cc.isDirty(rand_id))
        self.assertEqual(cc.dirtyCount, 1)
        self.assertEqual(cc.dump_lru(), lru_str)
        cc.clearDirty(rand_id)
        cc.consistencyCheck()
        self.assertFalse(cc.isDirty(rand_id))
        self.assertEqual(cc.dirtyCount, 0)
        # chunk should not have been evicted from cache
        self.assertEqual(len(cc), 1)
        self.assertTrue(rand_id in cc)
        # delete from cache
        del cc[rand_id]
        cc.consistencyCheck()
        # check cache is empty
        self.assertEqual(len(cc), 0)
        self.assertFalse(rand_id in cc)
        mem_tgt = cc.memTarget
        self.assertEqual(mem_tgt, 1024 * 10)
        mem_used = cc.memUsed
        self.assertEqual(mem_used, 0)
        mem_per = cc.cacheUtilizationPercent
        self.assertEqual(mem_per, 0)  # no memory used
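The 10% utilization assert follows from the flat accounting the comments point out: each metadata entry is charged 1024 bytes regardless of its actual size. Worked out under that assumption:

mem_target = 1024 * 10   # 10240 bytes
per_entry = 1024         # flat charge per metadata object
mem_used = 1 * per_entry
percent = mem_used * 100 // mem_target
assert percent == 10     # matches cc.cacheUtilizationPercent above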
Example 7
    def testMemUtil(self):
        """ Test memory usage tracks target """
        cc = LruCache(mem_target=5000)
        self.assertEqual(len(cc), 0)
        ids = set()
        for i in range(10):
            id = createObjId("chunks")
            ids.add(id)
            arr = np.empty((16, 16), dtype='i4')  # 1024 bytes
            arr[...] = i
            cc[id] = arr
            self.assertTrue(id in cc)

        cc.consistencyCheck()
        # given the mem_target, some items should have been removed
        self.assertTrue(len(cc) < 10)
        mem_per = cc.cacheUtilizationPercent
        self.assertTrue(mem_per < 100)
        mem_dirty = cc.memDirty
        self.assertEqual(mem_dirty, 0)

        # add 10 more chunks, but set dirty to true each time
        for i in range(10):
            id = createObjId("chunks")
            ids.add(id)
            arr = np.empty((16, 16), dtype='i4')  # 1024 bytes
            arr[...] = i
            cc[id] = arr
            self.assertTrue(id in cc)
            cc.setDirty(id)
            cc.consistencyCheck()
            mem_dirty = cc.memDirty
            self.assertEqual(mem_dirty, 1024 * (i + 1))

        mem_per = cc.cacheUtilizationPercent
        # chunks are dirty so percent is over 100%
        self.assertTrue(mem_per > 100)

        # clear dirty flags (allowing memory to be released)
        id_list = []
        for id in cc:
            id_list.append(id)

        random.shuffle(id_list)  # randomize the order we clear dirty flag

        id = id_list[0]
        cc.clearDirty(id)
        cc.consistencyCheck()

        for id in id_list:
            self.assertTrue(id in ids)
            mem_dirty = cc.memDirty
            if cc.isDirty(id):
                cc.clearDirty(id)
                self.assertTrue(cc.memDirty < mem_dirty)
        mem_per = cc.cacheUtilizationPercent
        # mem percent should be no more than 100 now
        self.assertTrue(mem_per <= 100)
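A toy model of the eviction rule this test depends on (a sketch, not the LruCache implementation): cleanup may only reclaim clean entries, so dirty bytes can hold usage above mem_target until the flags are cleared.

from collections import OrderedDict

class ToyLru:
    """Toy LRU where eviction skips dirty entries."""
    def __init__(self, mem_target):
        self.mem_target = mem_target
        self.entries = OrderedDict()  # id -> (nbytes, dirty)
        self.used = 0

    def put(self, key, nbytes, dirty=False):
        self.entries[key] = (nbytes, dirty)
        self.used += nbytes
        # evict clean entries, oldest first, until back under target
        for k in list(self.entries):
            if self.used <= self.mem_target:
                break
            size, is_dirty = self.entries[k]
            if not is_dirty:
                del self.entries[k]
                self.used -= size

cache = ToyLru(mem_target=5000)
for i in range(10):
    cache.put(f"c-{i}", 1024, dirty=True)
assert cache.used > cache.mem_target  # dirty entries can't be evicted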
Example 8
    def testLRU(self):
        """ Check LRU replacement logic """
        # mem_target big enough that there shouldn't be any cleanup
        cc = LruCache(mem_target=1024 * 1024 * 1024)
        self.assertEqual(len(cc), 0)
        ids = []
        # add chunks to the cache
        for i in range(10):
            id = createObjId("chunks")
            ids.append(id)
            arr = np.empty((16, 16), dtype='i4')  # 1024 bytes
            arr[...] = i
            cc[id] = arr
        for id in cc:
            self.assertTrue(id.startswith("c-"))
            self.assertTrue(id in ids)
        self.assertEqual(len(cc), 10)
        self.assertEqual(cc._lru_head._id, ids[-1])
        self.assertEqual(cc._lru_tail._id, ids[0])
        self.assertEqual(cc.dirtyCount, 0)
        cc.consistencyCheck()

        node = cc._lru_head
        for i in range(10):
            self.assertEqual(node._id, ids[9 - i])
            node = node._next
        self.assertTrue(node is None)

        chunk_5 = ids[5]
        cc.consistencyCheck()

        np_arr = cc[chunk_5]
        self.assertEqual(np_arr[0, 0], 5)
        # the get should have moved this entry to the front of the LRU list
        self.assertEqual(cc._lru_head._id, chunk_5)
        for i in range(10):
            self.assertFalse(cc.isDirty(ids[i]))
        # checking the dirty flags shouldn't have affected the position
        self.assertEqual(cc._lru_head._id, chunk_5)
        # set chunk 7 to dirty
        chunk_7 = ids[7]
        cc.consistencyCheck()
        cc.setDirty(chunk_7)
        cc.consistencyCheck()
        self.assertEqual(cc.dirtyCount, 1)
        # clear dirty
        cc.clearDirty(chunk_7)
        self.assertEqual(cc.dirtyCount, 0)

        random.shuffle(ids)  # randomize the order we remove chunks
        for i in range(10):
            # remove random chunk
            chunk_id = ids[i]
            del cc[chunk_id]
            cc.consistencyCheck()
        self.assertEqual(len(cc), 0)
        self.assertIsNone(cc._lru_head)
        self.assertIsNone(cc._lru_tail)
        cc.consistencyCheck()
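For comparison, the move-to-front-on-access behavior asserted above is what OrderedDict.move_to_end() provides a hand-rolled LRU; a compact sketch with the most recently used key at the end:

from collections import OrderedDict

lru = OrderedDict((f"c-{i}", i) for i in range(10))
value = lru["c-5"]      # plain dict access doesn't reorder...
lru.move_to_end("c-5")  # ...so an LRU get moves the key explicitly
assert next(reversed(lru)) == "c-5"  # now most recently used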
Example 9
    def testMetaDataCache(self):
        """ check metadata cache functionality """
        cc = LruCache(mem_target=1024 * 10, name="ChunkCache")
        cc.consistencyCheck()

        self.assertEqual(len(cc), 0)
        self.assertEqual(cc.dump_lru(), "->\n<-\n")

        data = {"x": 123, "y": 456}

        rand_id = createObjId("groups")
        data = {"foo": "bar"}
        cc[rand_id] = data  # add to cache
        cc.consistencyCheck()
        self.assertEqual(len(cc), 1)
        self.assertTrue(rand_id in cc)
        lru_str = "->" + rand_id + "\n<-" + rand_id + "\n"
        mem_tgt = cc.memTarget
        self.assertEqual(mem_tgt, 1024 * 10)
        mem_used = cc.memUsed
        self.assertEqual(mem_used, 1024)  # not based on actual size
        mem_per = cc.cacheUtilizationPercent
        self.assertEqual(mem_per, 10)  # have used 10% of target memory
        # try out the dirty flags
        self.assertFalse(cc.isDirty(rand_id))
        self.assertEqual(cc.dirtyCount, 0)
        cc.setDirty(rand_id)
        cc.consistencyCheck()
        self.assertTrue(cc.isDirty(rand_id))
        self.assertEqual(cc.dirtyCount, 1)
        self.assertEqual(cc.dump_lru(), lru_str)
        cc.clearDirty(rand_id)
        cc.consistencyCheck()
        self.assertFalse(cc.isDirty(rand_id))
        self.assertEqual(cc.dirtyCount, 0)
        # chunk should not have been evicted from cache
        self.assertEqual(len(cc), 1)
        self.assertTrue(rand_id in cc)
        # delete from cache
        del cc[rand_id]
        cc.consistencyCheck()
        # check cache is empty
        self.assertEqual(len(cc), 0)
        self.assertFalse(rand_id in cc)
        mem_tgt = cc.memTarget
        self.assertEqual(mem_tgt, 1024 * 10)
        mem_used = cc.memUsed
        self.assertEqual(mem_used, 0)
        mem_per = cc.cacheUtilizationPercent
        self.assertEqual(mem_per, 0)  # no memory used