def verify_results_split(R, I, input_array_path, datadir):
    """ Compare each split file to the corresponding block of the original array. """
    from dask_io.optimizer.cases.resplit_utils import get_blocks_shape

    splitfiles_partition = get_blocks_shape(R, I)
    print("split files partition:", splitfiles_partition)

    all_true = True
    orig_arr = get_dask_array_from_hdf5(input_array_path, "/data", logic_cs=tuple(I))
    for i in range(splitfiles_partition[0]):
        for j in range(splitfiles_partition[1]):
            for k in range(splitfiles_partition[2]):
                splitfilename = f"{i}_{j}_{k}.hdf5"
                split_filepath = os.path.join(datadir, splitfilename)
                print("opening", split_filepath)
                splitarray = get_dask_array_from_hdf5(split_filepath, "/data")
                print(f"Slices from ground truth "
                      f"{i*I[0]}:{(i+1)*I[0]}, {j*I[1]}:{(j+1)*I[1]}, {k*I[2]}:{(k+1)*I[2]}")
                ground_truth_arr = orig_arr[i * I[0]:(i + 1) * I[0],
                                            j * I[1]:(j + 1) * I[1],
                                            k * I[2]:(k + 1) * I[2]]
                verify_task = da.allclose(ground_truth_arr, splitarray)
                print("VERIFY TASK: ", verify_task)
                disable_clustering()
                _res = verify_task.compute()
                print("RESULT: ", _res)
                if not _res:
                    print(f"[Error] Split failed for {splitfilename}")
                    all_true = False

    clean_files()
    return all_true
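# Usage sketch (not part of the original code; the shapes and paths below are
# hypothetical placeholders): verify a 100x100x100 cuboid that was split into
# 50x50x50 blocks written as i_j_k.hdf5 files inside a data directory.
def _example_verify_split():
    R = (100, 100, 100)   # shape of the original array
    I = (50, 50, 50)      # shape of each split block
    ok = verify_results_split(R, I, "./original.hdf5", "./split_blocks")
    print("split verified:", ok)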
def verify_results_merge(input_array_path, merged_array_path):
    """ Check that the merged (rechunked) array matches the original array. """
    original_array = get_dask_array_from_hdf5(input_array_path, "/data")
    merged_array = get_dask_array_from_hdf5(merged_array_path, "/data")
    verify_task = da.allclose(original_array, merged_array)
    print("VERIFY TASK: ", verify_task)
    disable_clustering()
    _res = verify_task.compute()
    print("RESULT: ", _res)
    if not _res:
        print("[Error] Rechunk failed")
    clean_files()
    return _res
def test_sum(shape_to_test, nb_chunks):
    """ Test that summing two blocks gives the same result with and without the
    optimization function enabled.
    """
    logger.info("testing shape %s", shape_to_test)

    # prepare test case
    case = Split(pytest.test_array_path, shape_to_test)
    case.sum(nb_chunks)

    # non optimized run
    disable_clustering()
    result_non_opti = case.get().compute()

    # optimized run
    enable_clustering(buffer_size)
    result_opti = case.get().compute()

    assert np.array_equal(result_non_opti, result_opti)
def run_test(test, paths):
    """ Wrapper around the 'run' function for diagnostics.

    Arguments:
    ----------
        test: test case object exposing params, splitcase and mergecase.
        paths: paths prepared for the test (not used directly in this function).
    """
    test.print_config()
    uid = uuid.uuid4()
    print("Test ID is ", str(uid))

    params = getattr(test, 'params')
    splitcase = getattr(test, 'splitcase')
    mergecase = getattr(test, 'mergecase')

    if params["optimized"]:
        enable_clustering(params["buffer_size"])
    else:
        disable_clustering()

    # split run
    flush_cache()
    try:
        arr = splitcase.get()
        tsplit, diagnostics_split, monitor_split = run_to_hdf5(
            arr, params, uid, str(params["chunk_shape"]), params["optimized"])
    except Exception as e:
        print(e)
        return [
            params["hardware"], params["cuboid_name"], params["array_shape"],
            params["chunk_type"], params["chunk_shape"], params["optimized"],
            params["buffer_size"], params["nthreads"],
            None, None, None, None, None, None, None, None
        ]
    finally:
        splitcase.clean()

    R = cuboids[params["cuboid_name"]]['shape']
    I = splitcase.chunks_shape
    print(f'R: {R}')
    print(f'I: {I}')
    if 'auto' not in I:
        success_run_split = verify_results_split(R, I,
                                                 getattr(test, 'cuboid_filepath'),
                                                 getattr(test, 'hardware_path'))
    else:
        success_run_split = None
    print(f'[Split] Find the diagnostics output file at {diagnostics_split}')
    print(f'[Split] Find the monitor output file at {monitor_split}')

    # merge run
    flush_cache()
    try:
        arr = mergecase.get()
        tmerge, diagnostics_merge, monitor_merge = run_to_hdf5(
            arr, params, uid, str(params["chunk_shape"]), params["optimized"])
    except Exception as e:
        print(e)
        return [
            params["hardware"], params["cuboid_name"], params["array_shape"],
            params["chunk_type"], params["chunk_shape"], params["optimized"],
            params["buffer_size"], params["nthreads"],
            round(tsplit, 4), None, None, None, None, None, None, None
        ]
    finally:
        mergecase.clean()

    success_run_merge = verify_results_merge(getattr(test, 'cuboid_filepath'),
                                             getattr(test, 'merge_filepath'))
    print(f'[Merge] Find the diagnostics output file at {diagnostics_merge}')
    print(f'[Merge] Find the monitor output file at {monitor_merge}')

    # clean up the files produced by this run
    datadir = getattr(test, 'hardware_path')
    merged_filepath = getattr(test, 'merge_filepath')
    clean_directory(datadir)
    os.remove(merged_filepath)

    sample_res = [
        params["hardware"], params["cuboid_name"], params["array_shape"],
        params["chunk_type"], params["chunk_shape"], params["optimized"],
        params["buffer_size"], params["nthreads"],
        round(tsplit, 4), round(tmerge, 4),
        diagnostics_split, diagnostics_merge,
        monitor_split, monitor_merge,
        success_run_split, success_run_merge
    ]
    print("-------------RESULT\n", sample_res)
    return sample_res
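# Sketch of one way to persist the rows returned by run_test (this helper is an
# assumption, not part of the original code; the column names simply mirror the
# order of `sample_res` above and the output path is a placeholder).
import csv

def write_results_csv(rows, out_path="./results.csv"):
    columns = [
        "hardware", "cuboid_name", "array_shape", "chunk_type", "chunk_shape",
        "optimized", "buffer_size", "nthreads", "t_split", "t_merge",
        "diagnostics_split", "diagnostics_merge", "monitor_split",
        "monitor_merge", "success_split", "success_merge",
    ]
    with open(out_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        writer.writerows(rows)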
with dask.config.set(scheduler='single-threaded'):
    t = time.time()
    _ = split_arr.compute()
    t = time.time() - t
times.append([buffer, t, "optimized"])
visualize([prof, rprof, cprof],
          os.path.join(output_directory, str(buffer) + "opti" + ".html"),
          show=False)
os.remove(output_filepath)  # remove output file for next run

# run non optimized
with h5py.File(output_filepath, 'x') as f_out:  # open output file for the split
    split_arr = split_to_hdf5(in_arr, f_out, nb_blocks=None)
    print("RUNNING NON OPTIMIZED")
    disable_clustering()
    flush_cache()
    with Profiler() as prof, ResourceProfiler() as rprof, \
            CacheProfiler(metric=nbytes) as cprof:
        t = time.time()
        _ = split_arr.compute()
        t = time.time() - t
times.append([buffer, t, "non optimized"])
visualize([prof, rprof, cprof],
          os.path.join(output_directory, str(buffer) + "nonopti" + ".html"),
          show=False)
os.remove(output_filepath)  # remove output file for next run

for r in times:
    print(r)  # assumed completion: report each [buffer_size, elapsed_time, mode] row
def test_split(optimized, nb_chunks, shape_to_test):

    def create_arrays_for_comparison():
        """ Get the chunks as dask arrays to compare them to the split files. """
        arr = get_dask_array_from_hdf5(pytest.test_array_path, '/data',
                                       logic_cs=shape_to_test)
        arr_list = get_arr_chunks(arr, nb_chunks=nb_chunks)
        return arr_list

    def apply_sanity_check(split_filepath):
        """ Check that the split file is not empty. """
        logger.info("Checking split file integrity...")
        with h5py.File(split_filepath, 'r') as f:
            keys_list = list(f.keys())
            logger.info("file: %s", f)
            logger.info("Number of datasets in hdf5 file: %s", len(keys_list))
            assert len(keys_list) != 0
            logger.info("First item: %s", keys_list[0])
        logger.info("Integrity check passed.\n")

    def store_correct():
        """ Compare the original chunks to the stored splits to check that the
        array was split correctly.
        """
        logger.info("Testing %s matches...", len(arr_list))
        with h5py.File(split_filepath, 'r') as f:
            for i, a in enumerate(arr_list):
                stored_a = da.from_array(f['/data' + str(i)])
                stored_a = stored_a.rechunk(chunks=shape_to_test)
                test = da.allclose(stored_a, a)
                disable_clustering()  # TODO: remove this, make it work even for allclose
                assert test.compute()
        logger.info("Passed.\n")

    def split():
        # overwrite if the split file already exists
        if os.path.isfile(split_filepath):
            os.remove(split_filepath)
        case = Split(pytest.test_array_path, shape_to_test)
        case.split_hdf5(split_filepath, nb_blocks=nb_chunks)
        case.get().compute()

    logger.info("PARAMETERS:")
    logger.info("Optimized: %s", optimized)
    logger.info("Nb_chunk: %s", nb_chunks)
    logger.info("Shape: %s \n", shape_to_test)

    # setup config
    split_filepath = "./split_file.hdf5"
    if optimized:
        enable_clustering(buffer_size)
    else:
        disable_clustering()

    # test
    split()
    apply_sanity_check(split_filepath)

    # assert
    arr_list = create_arrays_for_comparison()
    store_correct()
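# Sketch of how test_split could be driven by pytest parametrization (an
# assumption for illustration; in the original suite the values for `optimized`,
# `nb_chunks` and `shape_to_test` come from the project's own fixtures). The
# shapes below evenly divide the (100, 100, 100) test array created further down.
import pytest

@pytest.mark.parametrize("optimized", [True, False])
@pytest.mark.parametrize("nb_chunks", [None, 2])
@pytest.mark.parametrize("shape_to_test", [(20, 20, 20), (50, 50, 50)])
def test_split_parametrized(optimized, nb_chunks, shape_to_test):
    test_split(optimized, nb_chunks, shape_to_test)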
def test_array_path():
    """ Create the small test array if it does not exist yet and return its path. """
    disable_clustering()
    array_filepath = './small_array_nochunk.hdf5'
    if not os.path.isfile(array_filepath):
        create_test_array_nochunk(array_filepath, (100, 100, 100))
    return array_filepath
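# Sketch (an assumption, not from the original source): the tests above read
# pytest.test_array_path, so a conftest.py hook could expose the path like this.
import pytest

def pytest_configure(config):
    # build the test array once and attach its path to the pytest module
    pytest.test_array_path = test_array_path()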