Example #1
def test_merge_cached_volumes():
    # prep case
    R = R_test
    B = B_test
    O = O_test
    buffers_partition = get_blocks_shape(R, B)
    buffers_volumes = get_named_volumes(buffers_partition, B)
    outfiles_partititon = get_blocks_shape(R, O)
    outfiles_volumes = get_named_volumes(outfiles_partititon, O)
    buff_to_vols = get_buff_to_vols(R, B, O, buffers_volumes,
                                    buffers_partition)
    test_arrays = get_arrays_dict(buff_to_vols, buffers_volumes,
                                  outfiles_volumes, outfiles_partititon)

    # do the merge
    merge_cached_volumes(test_arrays, volumes_to_keep_test)

    # assert
    expected = {
        0: 1,
        1: 1,  # << modified 
        2: 1,
        3: 2,
        4: 3,  # << modified
        5: 2,
        6: 1,
        7: 2,
        8: 1
    }
    test_arrays_lengths = {k: len(v) for (k, v) in test_arrays.items()}
    # logger.debug("----------After merge:")
    # neat_print(test_arrays)
    for k, v in expected.items():
        assert test_arrays_lengths[k] == v
Example #2
def test_clean_arrays_dict():
    # prep case
    R = R_test
    B = B_test
    O = O_test
    buffers_partition = get_blocks_shape(R, B)
    buffers_volumes = get_named_volumes(buffers_partition, B)
    outfiles_partititon = get_blocks_shape(R, O)
    outfiles_volumes = get_named_volumes(outfiles_partititon, O)
    buff_to_vols = get_buff_to_vols(R, B, O, buffers_volumes,
                                    buffers_partition)
    test_arrays = get_arrays_dict(buff_to_vols, buffers_volumes,
                                  outfiles_volumes, outfiles_partititon)
    merge_cached_volumes(test_arrays, volumes_to_keep_test)

    # do the clean
    clean_arrays_dict(test_arrays)
    # logger.debug("----------After cleaning:")
    # logger.debug(test_arrays)
    for outputfile_key, expected_array_list in d_arrays_expected.items():
        arrays_list = test_arrays[outputfile_key]

        expected_array_list = [str(e) for e in expected_array_list]
        arrays_list = [str(e) for e in arrays_list]

        for e in expected_array_list:
            assert e in arrays_list
Example #3
def test_get_dirty_arrays_dict():
    """ By "dirty" we mean not yet cleaned; see the clean function.
    """
    R = R_test 
    B = B_test 
    O = O_test 

    buffers_partition = get_blocks_shape(R, B)
    buffers_volumes = get_named_volumes(buffers_partition, B)
    outfiles_partititon = get_blocks_shape(R, O)
    outfiles_volumes = get_named_volumes(outfiles_partititon, O)
    buff_to_vols = get_buff_to_vols(R, B, O, buffers_volumes, buffers_partition)
    
    test_arrays = get_arrays_dict(buff_to_vols, buffers_volumes, outfiles_volumes) 
    test_arrays_lengths = { k: len(v) for (k, v) in test_arrays.items()}
    expected = {
        0: 1,
        1: 2,
        2: 1,
        3: 2,
        4: 4,
        5: 2, 
        6: 1, 
        7: 2, 
        8: 1
    }

    logger.debug("----------Before merge:")
    neat_print(test_arrays)

    for k, v in expected.items():
        assert test_arrays_lengths[k] == v
Example #4
def test_get_buff_to_vols():
    R = R_test 
    B = B_test 
    O = O_test 

    buffers_partition = get_blocks_shape(R, B)
    buffers_volumes = get_named_volumes(buffers_partition, B)
    outfiles_partititon = get_blocks_shape(R, O)
    outfiles_volumes = get_named_volumes(outfiles_partititon, O)
    buff_to_vols = get_buff_to_vols(R, B, O, buffers_volumes, buffers_partition)
Example #5
def test_get_volumes():
    """ test getmain and gethidden
    """
    R = (1, 120, 120)
    B = (1, 60, 60)
    O = (1, 40, 40)

    logger.debug("FUNCTION test_get_volumes ---")

    from dask_io.optimizer.utils.utils import numeric_to_3d_pos
    from dask_io.optimizer.cases.resplit_utils import get_blocks_shape

    buffers_partition = get_blocks_shape(R, B)

    for bufferindex in range(4):
        logger.debug("buffer %s", bufferindex)
        _3d_index = numeric_to_3d_pos(bufferindex,
                                      buffers_partition,
                                      order='F')
        T = list()
        for dim in range(3):
            nb = _3d_index[dim] + 1
            logger.debug("nb:%s", nb)
            C = (nb * B[dim]) % O[dim]
            if C == 0 and B[dim] != O[dim]:
                C = O[dim]
            T.append(B[dim] - C)
            logger.debug("C: %s", C)
        logger.debug("T: %s", T)

        main_volumes = get_main_volumes(B, T)
        assert len(main_volumes) == 4
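As a rough worked check (not part of the original test), the C/T arithmetic above can be traced by hand for the first buffer, using the same R = (1, 120, 120), B = (1, 60, 60) and O = (1, 40, 40) as in the test:

# Hand-traced sketch of the loop above for buffer 0 (3D index (0, 0, 0)).
# B and O are the values from test_get_volumes; everything else is illustrative.
B = (1, 60, 60)
O = (1, 40, 40)
_3d_index = (0, 0, 0)  # first buffer
T = []
for dim in range(3):
    nb = _3d_index[dim] + 1           # nb == 1 in every dimension for buffer 0
    C = (nb * B[dim]) % O[dim]        # 0, 20, 20
    if C == 0 and B[dim] != O[dim]:
        C = O[dim]
    T.append(B[dim] - C)
print(T)  # expected: [1, 40, 40]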
Example #6
def verify_results(outdir_path, original_array_path, R, O):
    from dask_io.optimizer.cases.resplit_utils import get_blocks_shape
    outfiles_partition = get_blocks_shape(R, O)

    all_true = True
    with h5py.File(original_array_path, 'r') as f:
        orig_arr = f["/data"]

        for i in range(outfiles_partition[0]):
            for j in range(outfiles_partition[1]):
                for k in range(outfiles_partition[2]):
                    outfilename = f"{i}_{j}_{k}.hdf5"
                    with h5py.File(os.path.join(outdir_path, outfilename),
                                   'r') as f:
                        data_stored = f["/data"]
                        print(
                            f"Slices from ground truth {i*O[0]}:{(i+1)*O[0]}, {j*O[1]}:{(j+1)*O[1]}, {k*O[2]}:{(k+1)*O[2]}"
                        )
                        ground_truth = orig_arr[i * O[0]:(i + 1) * O[0],
                                                j * O[1]:(j + 1) * O[1],
                                                k * O[2]:(k + 1) * O[2]]

                        # print(data_stored[()])
                        # print(ground_truth)
                        try:
                            assert np.allclose(data_stored[()], ground_truth)
                            print(f"Good output file {outfilename}")
                        except Exception:
                            print(f"Error: bad rechunking {outfilename}")
                            all_true = False  # do not return here to see all failures
    return all_true
Example #7
def verify_results_split(R, I, input_array_path, datadir):
    from dask_io.optimizer.cases.resplit_utils import get_blocks_shape
    splitfiles_partition = get_blocks_shape(R, I)
    print("split files partiton:", splitfiles_partition)

    all_true = True
    orig_arr = get_dask_array_from_hdf5(input_array_path,
                                        "/data",
                                        logic_cs=tuple(I))

    for i in range(splitfiles_partition[0]):
        for j in range(splitfiles_partition[1]):
            for k in range(splitfiles_partition[2]):
                splitfilename = f"{i}_{j}_{k}.hdf5"
                split_filepath = os.path.join(datadir, splitfilename)
                print("opening", split_filepath)
                splitarray = get_dask_array_from_hdf5(split_filepath, "/data")
                print(
                    f"Slices from ground truth {i*I[0]}:{(i+1)*I[0]}, {j*I[1]}:{(j+1)*I[1]}, {k*I[2]}:{(k+1)*I[2]}"
                )
                ground_truth_arr = orig_arr[i * I[0]:(i + 1) * I[0],
                                            j * I[1]:(j + 1) * I[1],
                                            k * I[2]:(k + 1) * I[2]]

                verify_task = da.allclose(ground_truth_arr, splitarray)
                print("VERIFY TASK: ", verify_task)
                disable_clustering()
                _res = verify_task.compute()
                print("RESULT: ", _res)
                if not _res:
                    print(f"[Error] Split failed for {splitfilename}")
                    all_true = False

    clean_files()
    return all_true
Example #8
def test_regions_dict():
    """ Given arrays_dict, check that this function returns the correct regions_dict.
    """
    logger.debug("== Function == test_regions_dict")
    R = R_test 
    O = O_test 
    outfiles_partititon = get_blocks_shape(R, O)
    outfiles_volumes = get_named_volumes(outfiles_partititon, O)

    regions_dict = get_regions_dict(d_arrays_expected, outfiles_volumes)
    for outputfile_key, expected_regions_list in regions_dict.items():
        regions_list = regions_dict[outputfile_key]
        expected_regions_list = [str(e) for e in expected_regions_list]
        regions_list = [str(e) for e in regions_list]

        logger.debug("Outfile n°%s", outputfile_key)
        logger.debug("Associated regions:")
        for e in regions_list:
            logger.debug("\t%s", e)

        for e in expected_regions_list:
            assert e in regions_list  
Example #9
def apply_store(B, O, R, volumestokeep, reconstructed_array):
    """ Apply store, using the keep strategy.
    """
    # creations of data for dask store function
    d_arrays, d_regions = compute_zones(B, O, R, volumestokeep)
    out_files = list()  # to keep outfiles open during processing
    sources = list()
    targets = list()
    regions = list()
    for outfile_index in sorted(d_arrays.keys()):
        sliceslistoflist = d_arrays[outfile_index]

        # create file
        outfiles_partition = get_blocks_shape(R, O)
        _3d_pos = numeric_to_3d_pos(outfile_index,
                                    outfiles_partition,
                                    order='F')
        i, j, k = _3d_pos
        out_filename = f'{i}_{j}_{k}.hdf5'
        out_file = h5py.File(os.path.join(outdir_path, out_filename), 'w')
        out_files.append(out_file)

        # create dset
        dset = out_file.create_dataset('/data', shape=O, dtype=np.float16)

        for i, st in enumerate(sliceslistoflist):
            tmp_array = reconstructed_array[st[0], st[1], st[2]]
            # print("Volume to be stored shape: ", tmp_array.shape)
            reg = d_regions[outfile_index][i]
            tmp_array = tmp_array.rechunk(tmp_array.shape)

            sources.append(tmp_array)
            targets.append(dset)
            regions.append(reg)

    return da.store(sources, targets, regions=regions,
                    compute=False), out_files
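A minimal usage sketch (not from the source): apply_store returns the delayed da.store task together with the list of open HDF5 files, so a caller would typically compute the task and then close the files. The array and parameter names below are placeholders:

# Hypothetical caller: compute the delayed store task, then close the output files
# that apply_store kept open during the write.
store_task, open_files = apply_store(B, O, R, volumestokeep, reconstructed_array)
store_task.compute()
for f in open_files:
    f.close()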
Example #10
def check_outputs():
    # sanity check
    outfiles = list()
    for fpath in glob.glob(
            "[0-9].hdf5"):  # remove split files from previous tests
        print(f'Filename: {fpath}')
        with h5py.File(fpath, 'r') as f:
            inspect_h5py_file(f)

    # prepare ground truth for verification
    arrays_expected = dict()
    outfiles_partititon = get_blocks_shape((1, 120, 120), O)
    outfiles_volumes = get_named_volumes(outfiles_partititon, O)
    for outfilekey, volume in outfiles_volumes.items():
        slices = convert_Volume_to_slices(volume)
        arrays_expected[outfilekey] = reconstructed_array[slices[0], slices[1],
                                                          slices[2]]

    # verify
    for fpath in glob.glob("[0-9].hdf5"):
        outputfile_index = int(fpath.split('.')[0])
        print(f'Output file index: {outputfile_index}')

        array_stored = get_dask_array_from_hdf5(fpath,
                                                '/data',
                                                logic_cs="dataset_shape")
        arr_expected = arrays_expected[outputfile_index]
        print("equal:", da.allclose(array_stored, arr_expected).compute())
        print(
            "stored:", array_stored[slice(0, 1, None),
                                    slice(0, 1, None),
                                    slice(0, 10, None)].compute())
        print(
            "expected", arr_expected[slice(0, 1, None),
                                     slice(0, 1, None),
                                     slice(0, 10, None)].compute())
Example #11
    import logging
    import logging.config
    logging.config.dictConfig({
        'version': 1,
        'disable_existing_loggers': True,
    })

    for case in cases:
        _type = int(case["type"])
        R, O, I, B = tuple(case["R"]), tuple(case["O"]), tuple(case["I"]), tuple(case["B"])
        volumestokeep = case["volumestokeep"]
        print(
            f'Current run ------ \nType: {_type}\nR: {R},\nO: {O},\nI: {I}\nvolumestokeep: {volumestokeep}'
        )

        buffers_partition = get_blocks_shape(R, B)
        buffers_volumes = get_named_volumes(buffers_partition, B)

        # find omega and theta max
        omega_max = [0, 0, 0]
        T_max = [0, 0, 0]
        for buffer_index in buffers_volumes.keys():
            _3d_index = numeric_to_3d_pos(buffer_index,
                                          buffers_partition,
                                          order='F')
            T, Cs = get_theta(buffers_volumes, buffer_index, _3d_index, O, B)

            for i in range(3):
                if Cs[i] > omega_max[i]:
                    omega_max[i] = Cs[i]
                if T[i] > T_max[i]:
                    T_max[i] = T[i]
Example #12
def rechunk_vanilla_dask(indir_path, outdir_path, nthreads, R, O, model):
    """ Rechunk using vanilla dask
    """
    in_arrays = load_input_files(indir_path)

    case = Merge('samplename')
    case.merge_hdf5_multiple(indir_path, store=False)
    reconstructed_array = case.get()

    out_files = list()  # to keep outfiles open during processing
    sources = list()
    targets = list()
    outfiles_partition = get_blocks_shape(R, O)
    for i in range(outfiles_partition[0]):
        for j in range(outfiles_partition[1]):
            for k in range(outfiles_partition[2]):
                out_filename = f'{i}_{j}_{k}.hdf5'
                out_file = h5py.File(os.path.join(outdir_path, out_filename),
                                     'w')
                dset = out_file.create_dataset('/data',
                                               shape=O,
                                               dtype=np.float16)

                tmp_array = reconstructed_array[i * O[0]:(i + 1) * O[0],
                                                j * O[1]:(j + 1) * O[1],
                                                k * O[2]:(k + 1) * O[2]]
                print(
                    f'{i*O[0]}: {(i+1)*O[0]}, {j*O[1]}: {(j+1)*O[1]}, {k*O[2]}: {(k+1)*O[2]}'
                )

                out_files.append(out_file)
                sources.append(tmp_array)
                targets.append(dset)

    rechunk_task = da.store(sources, targets, compute=False)
    # rechunk_task.visualize(filename="tmp_dir/test_graph_vanilla.png")
    # sys.exit()

    with Profiler() as prof, ResourceProfiler(
            dt=0.25) as rprof, CacheProfiler() as cprof:
        scheduler = 'single-threaded' if nthreads == 1 else 'threads'

        with dask.config.set(scheduler=scheduler):
            try:
                t = time.time()
                rechunk_task.compute()
                t = time.time() - t
                # visualize([prof, rprof, cprof])
            except Exception as e:
                print(e, "\nSomething went wrong during graph execution.")
                t = None

        diagnostics = os.path.join(outdir_path, 'exp5_' + str(model) + '.html')
        visualize([prof, rprof, cprof], diagnostics, show=False)

    clean_files()

    for f in out_files:
        f.close()

    return t
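A hedged usage sketch (paths, shapes, thread count and the model label below are placeholders): the function returns the elapsed compute time in seconds, or None if graph execution failed.

# Hypothetical call to rechunk_vanilla_dask; adjust paths and parameters to your setup.
elapsed = rechunk_vanilla_dask("data/in", "data/out", nthreads=1,
                               R=(1, 120, 120), O=(1, 40, 40), model="baseline")
if elapsed is not None:
    print(f"vanilla dask rechunk took {elapsed:.2f} s")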