Example #1
class TestConfig:
    def __init__(self, params, paths):
        self.params = params
        self.create_split_case(params, paths)
        self.create_merge_case(params, paths)

    def create_split_case(self, params, paths):
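        # pick the output directory according to the hardware under test, then split
        # the cuboid HDF5 file into one file per block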
        try:
            if params["hardware"] == "hdd":
                self.hardware_path = paths["hdd_path"]
            else:
                self.hardware_path = paths["ssd_path"]
            self.cuboid_filepath = os.path.join(
                self.hardware_path, params["cuboid_name"] + ".hdf5")
            self.splitcase = Split(self.cuboid_filepath, params["chunk_shape"])
            self.splitcase.split_hdf5_multiple(self.hardware_path,
                                               nb_blocks=None)

        except Exception as e:
            print(traceback.format_exc())
            print("Something went wrong while creating case config.")
            exit(1)

    def create_merge_case(self, params, paths):
        try:
            if params["hardware"] == "hdd":
                self.hardware_path = paths["hdd_path"]
            else:
                self.hardware_path = paths["ssd_path"]

            self.merge_filepath = os.path.join(self.hardware_path,
                                               "merged.hdf5")
            self.mergecase = Merge(self.merge_filepath)
            self.mergecase.merge_hdf5_multiple(self.hardware_path,
                                               data_key='/data',
                                               store=True)

        except Exception as e:
            print(traceback.format_exc())
            print("Something went wrong while creating case config.")
            exit(1)

    def print_config(self):
        print(f'\n-------------------')
        print(f'Test configuration')
        print(f'-------------------')

        print(f'\nTest configurations:')
        print(f'\tHardware: {self.params["hardware"]}')
        print(f'\tCuboid name: {self.params["cuboid_name"]}')
        print(f'\tCuboid shape: "{self.params["array_shape"]}"')
        print(f'\tChunk shape: "{self.params["chunk_shape"]}"')
        print(f'\tChunk type: "{self.params["chunk_type"]}"')

        print(f'\nDask configuration:')
        print(f'\tOptimization enabled: {self.params["optimized"]}')
        print(f'\tBuffer size: {self.params["buffer_size"]} bytes')
        print(f'\tNb threads: {self.params["nthreads"]}')
        return
Example #2
def test_create_buffer_node():
    # preparation
    cs = (20, 20, 20)
    case = Split(pytest.test_array_path, cs)
    case.split_hdf5("./split_file.hdf5", nb_blocks=None)
    arr = case.get()

    graph = arr.dask.dicts
    _, dicts = get_used_proxies(graph)
    origarr_name = list(dicts['origarr_to_obj'].keys())[0]
    buffers = create_buffers(origarr_name, dicts, cs)

    # apply function
    keys = list()
    for buffer in buffers:
        key = create_buffer_node(graph, origarr_name, dicts, buffer, cs)
        keys.append(key)

    # test output
    buffers_key = origarr_name.split('-')[-1] + '-merged'
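    # create_buffer_node is expected to register the merged buffer tasks under this
    # key, each task keyed by the (start, end) range of blocks it covers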

    indices = set()
    for buffer_key in graph[buffers_key].keys():
        _, start, end = buffer_key
        indices.add((start, end))

    buffers = set([(b[0], b[-1]) for b in buffers])

    assert buffers_key in graph.keys()
    assert len(indices) == len(buffers)
    assert buffers == indices
Example #3
    def create_split_case(self, params, paths):
        try:
            if params["hardware"] == "hdd":
                self.hardware_path = paths["hdd_path"]
            else:
                self.hardware_path = paths["ssd_path"]
            self.cuboid_filepath = os.path.join(
                self.hardware_path, params["cuboid_name"] + ".hdf5")
            self.splitcase = Split(self.cuboid_filepath, params["chunk_shape"])
            self.splitcase.split_hdf5_multiple(self.hardware_path,
                                               nb_blocks=None)

        except Exception as e:
            print(traceback.format_exc())
            print("Something went wrong while creating case config.")
            exit(1)
Example #4
def test_split_and_merge_multiple(shape_to_test, nb_chunks):
    """ TODO: add asserts -> retrieve chunks and compare to what have been stored.
    """
    fileslist = list()
    # remove split files from previous tests
    for infilepath in glob.glob("[0-9]*_[0-9]*_[0-9]*.hdf5"):
        fileslist.append(infilepath)
    fileslist.append('./reconstructed.hdf5')
    for fn in fileslist:
        if os.path.isfile(fn):
            os.remove(fn)

    out_dirpath = './'
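    # split phase: write each block of the test array to its own HDF5 file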
    case = Split(pytest.test_array_path, shape_to_test)
    case.split_hdf5_multiple(out_dirpath, nb_blocks=None)
    arr = case.get()
    arr.compute()
    case.clean()

    in_dirpath = out_dirpath
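    # merge phase: rebuild a single HDF5 file from the per-block files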
    case = Merge('./reconstructed.hdf5')
    case.merge_hdf5_multiple(in_dirpath)
    arr = case.get()
    arr.compute()
    case.clean()

    logger.info("Inspecting filepath: './reconstructed.hdf5'")
    with h5py.File('./reconstructed.hdf5', 'r') as f:
        inspect_h5py_file(f)
        assert f['/data'].shape == (100, 100, 100)
Example #5
def test_create_buffers_slabs():
    """ Test if the buffering works according to clustered writes when processing slabs.
    The only strategy that should be used is "block slices".
    """
    cs = (5, 100, 100)  # 20 chunks
    case = Split(pytest.test_array_path, cs)
    case.split_hdf5("./split_file.hdf5", nb_blocks=None)
    arr = case.get()

    _, dicts = get_used_proxies(arr.dask.dicts)
    origarr_name = list(dicts['origarr_to_obj'].keys())[0]

    nb_bytes_per_block = 100 * 100 * 5
    byte_size = 2
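    # expected buffer groupings: one slab per buffer (l1), ten slabs per buffer (l2),
    # up to seven slabs per buffer (l3)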
    l1 = [[i] for i in range(20)]
    l2 = [list(range(10)), list(range(10, 20))]
    l3 = [list(range(7)), list(range(7, 14)), list(range(14, 20))]

    experiment_params = {
        nb_bytes_per_block * byte_size: l1,
        nb_bytes_per_block * byte_size * 10: l2,
        nb_bytes_per_block * byte_size * 7: l3
    }

    for buffer_size, expected in experiment_params.items():
        logging.info("\nTesting buffer %s", buffer_size)
        logging.debug("Expecting %s", expected)
        enable_clustering(buffer_size, mem_limit=True)
        buffers = create_buffers(origarr_name, dicts, cs)
        logging.debug("Got %s", buffers)
        assert buffers == expected
Example #6
def split(inputfilepath, I):
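    # split the input HDF5 file into one file per block of shape I in the current
    # directory, then execute the resulting Dask graph and clean up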
    filetosplitpath = inputfilepath
    splitfilesshape = I
    case = Split(filetosplitpath, splitfilesshape)
    case.split_hdf5_multiple(
        './', nb_blocks=None)  # split all blocks into different files
    arr = case.get()
    arr.compute()
    case.clean()
Example #7
    def split():
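        # split_filepath, shape_to_test and nb_chunks come from the enclosing test scope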
        # overwrite if split file already exists
        if os.path.isfile(split_filepath):
            os.remove(split_filepath)

        case = Split(pytest.test_array_path, shape_to_test)
        case.split_hdf5(split_filepath, nb_blocks=nb_chunks)
        case.get().compute()
        return
Example #8
def test_get_graph_from_dask():
    """ Test if it runs well.
    TODO: Better test function.
    """
    # create config for the test
    case = Split(pytest.test_array_path, "auto")
    case.sum(nb_chunks=None)
    dask_array = case.get()

    # test function
    dask_graph = dask_array.dask.dicts 
    graph = get_graph_from_dask(dask_graph, undirected=False)
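    # no asserts yet: the test only checks that the conversion runs without raising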
Example #9
def test_split_multiple(shape_to_test, nb_chunks):
    """ TODO: add asserts -> retrieve chunks and compare to what have been stored.
    """
    out_dirpath = './'
    case = Split(pytest.test_array_path, shape_to_test)
    case.split_hdf5_multiple(out_dirpath, nb_blocks=None)
    arr = case.get()
    # arr.visualize(filename='/tmp/dask_io_visualize_split_multiple.svg')
    arr.compute()
    case.clean()

    for filepath in glob.glob("*.hdf5"):
        logger.info("Inspecting filepath: %s", filepath)
        with h5py.File(filepath, 'r') as f:
            inspect_h5py_file(f)
Example #10
def test_sum(shape_to_test, nb_chunks):
    """ Test if the sum of two blocks yields the good result using our optimization function.
    """
    logger.info("testing shape %s", shape_to_test)

    # prepare test case
    case = Split(pytest.test_array_path, shape_to_test)
    case.sum(nb_chunks)

    # non optimized run
    disable_clustering()
    result_non_opti = case.get().compute()

    # optimized run
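    # buffer_size is assumed to be defined at module scope (not shown in this snippet)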
    enable_clustering(buffer_size)
    result_opti = case.get().compute()

    assert np.array_equal(result_non_opti, result_opti)
Example #11
def split(inputfilepath, I, datadir):
    """ Split the input array stored at inputfilepath into outputfiles with shape I into datadir.
    Arguments: 
    ----------
        inputfilepath: Path to the input file we want to split.
        I: Output file shape. Shape of a chunk inside each output file.
        datadir: Path to directory in which to store the output files.
    """
    print("[preprocessing] Splitting input array...")
    case = Split(inputfilepath, I)
    case.split_hdf5_multiple(datadir, nb_blocks=None)
    arr = case.get()
    buffer_shape = ONE_GIG * 5
    # enable_clustering(buffer_shape)
    with dask.config.set(scheduler='single-threaded'):
        arr.compute()
    # disable_clustering()
    case.clean()
    print(f'Split done.')
Example #12
def test_get_blocks_used():
    cs = (20, 20, 20)
    case = Split(pytest.test_array_path, cs)
    case.split_hdf5("./split_file.hdf5", nb_blocks=None)
    arr = case.get()

    # routine to get the needed data
    # we assume those functions have been tested before get_blocks_used
    cs_confirmed, dicts = get_used_proxies(arr.dask.dicts)

    assert cs == cs_confirmed

    origarr_name = list(dicts['origarr_to_obj'].keys())[0]
    arr_obj = dicts['origarr_to_obj'][origarr_name]
    strategy, max_blocks_per_load = get_load_strategy(ONE_GIG, cs,
                                                      (100, 100, 100))

    # actual test of the function
    blocks_used, block_to_proxies = get_blocks_used(dicts, origarr_name,
                                                    arr_obj, cs)
    blocks_used.sort()
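    # a (100, 100, 100) array split into (20, 20, 20) chunks gives 5 * 5 * 5 = 125
    # blocks, all of which should be reported as used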
    expected = list(range(125))
    assert blocks_used == expected
Example #13
def split(datadir, filepath, cs, split_files=True):
    """ 
    Arguments: 
    ----------
        split_files: if true then perform a split into multiple files, if false then perform a split inside one hdf5 file
    """
    print("Splitting...")
    splitcase = Split(filepath, chunk_shapes[cs])

    if split_files:
        splitcase.split_hdf5_multiple(datadir, nb_blocks=None)
    else:
        out_filepath = os.path.join(datadir, "split.hdf5")
        splitcase.split_hdf5(out_filepath, nb_blocks=None)
    arr = splitcase.get()
    try:
        with dask.config.set(scheduler='single-threaded'):
            tsplit = run(arr)
        splitcase.clean()
        return tsplit
    except Exception as e:
        print(e, "\nOops something went wrong... Aborting.")
        splitcase.clean()
        sys.exit(1)
Example #14
def test_create_buffers_blocks():
    """ Test if the buffering works according to clustered writes in all 3 possible configurations.

    Data:
    -----
    input array shape: 100x100x100
    input arr created with 2 bytes per pixel
    block shape: 20x20x20
    
    Which gives us:
    ---------------
    - nb blocks per row = 5
    - nb blocks per slice = 25
    - block size in bytes: (20*20*20) * 2 bytes = 16000
    """
    cs = (20, 20, 20)
    case = Split(pytest.test_array_path, cs)
    case.split_hdf5("./split_file.hdf5", nb_blocks=None)
    arr = case.get()

    _, dicts = get_used_proxies(arr.dask.dicts)
    origarr_name = list(dicts['origarr_to_obj'].keys())[0]

    # EXPECTED BEHAVIOR FOR CLUSTERED WRITES
    l1 = [[i] for i in range(125)]  # 1 block
    l2 = list()  # 3 blocks
    for i in range(25):
        o = (i * 5)
        l2.append([0 + o, 1 + o, 2 + o])
        l2.append([3 + o, 4 + o])
    l3 = list()  # 1 block column
    for i in range(25):
        l3.append(list(range(i * 5, i * 5 + 5)))
    l4 = list()  # 2 block columns
    for i in range(5):
        o = i * 25  # offset
        l4.append(list(range(0 + o, 10 + o)))
        l4.append(list(range(10 + o, 20 + o)))
        l4.append(list(range(20 + o, 25 + o)))
    l5 = list()  # 1 block slice
    for i in range(5):
        l5.append(list(range((i * 25), (i * 25) + 25)))
    l6 = list()  # 3 block slices
    l6.append(list(range(0, 25 * 3)))
    l6.append(list(range(75, 125)))
    l7 = [list(range(125))]  # whole array

    nb_bytes_per_block = 20 * 20 * 20
    byte_size = 2
    experiment_params = {
        nb_bytes_per_block * byte_size: l1,  # 1 block
        nb_bytes_per_block * byte_size * 3: l2,  # some blocks (3)
        nb_bytes_per_block * byte_size * 5: l3,  # 1 block column
        nb_bytes_per_block * byte_size * 5 * 2: l4,  # some block columns (2)
        nb_bytes_per_block * byte_size * 5 * 5: l5,  # 1 block slice
        nb_bytes_per_block * byte_size * 5 * 5 * 3: l6,  # some block slices (3)
        nb_bytes_per_block * byte_size * 5 * 5 * 5: l7,  # whole array
    }

    for buffer_size, expected in experiment_params.items():
        logging.info("\nTesting buffer %s", buffer_size)
        logging.debug("Expecting %s", expected)
        enable_clustering(buffer_size, mem_limit=True)
        buffers = create_buffers(origarr_name, dicts, cs)
        logging.debug("Got %s", buffers)
        assert buffers == expected