def keep_reading(B, I, R):
    """
    b: read buffer shape
    i: inblock shape
    r: original image shape
    """

    buffer_partition = get_blocks_shape(R, B)
    read_buffers = get_named_volumes(buffer_partition, B)

    infiles_partition = get_blocks_shape(R, I)
    inblocks = get_named_volumes(infiles_partition, I)

    nb_inblocks_openings = 0
    nb_inblocks_seeks = 0

    for buffer_index in sorted(read_buffers.keys()):
        read_buffer = read_buffers[buffer_index]

        for inblock in inblocks.values():
            if hypercubes_overlap(read_buffer, inblock):
                nb_inblock_seeks_tmp = write_buffer(read_buffer, inblock, I)
                nb_inblocks_seeks += nb_inblock_seeks_tmp
                nb_inblocks_openings += 1

    print(f"[Reality] Number inblocks opening: {nb_inblocks_openings}")
    print(f"[Reality] Number inblocks seeks: {nb_inblocks_seeks}")
    return nb_inblocks_openings + nb_inblocks_seeks
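
# Hedged usage sketch, not from the original repository: it assumes keep_reading
# (defined above) and its helpers are importable from the repartition_experiments
# package, and that R is evenly divisible by both B and I. The shapes are
# illustrative only.
def example_keep_reading():
    R = (100, 100, 100)  # original image shape
    B = (50, 100, 100)   # read buffer shape
    I = (50, 50, 50)     # input block shape
    return keep_reading(B, I, R)  # total number of inblock openings + seeks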
Example #2
def baseline_rechunk(O, I, R):
    """
    Arguments: 
    ----------
        O, I, R: tuples
    """

    infiles_partition = get_blocks_shape(R, I)
    inblocks = get_named_volumes(infiles_partition, I)

    outfiles_partition = get_blocks_shape(R, O)
    outblocks = get_named_volumes(outfiles_partition, O)

    nb_infile_openings = 0
    nb_infile_seeks = 0
    nb_outfile_openings = 0
    nb_outfile_seeks = 0

    for buffer_index in sorted(inblocks.keys()):
        read_buffer = inblocks[buffer_index]
        nb_infile_openings += 1

        for outblock in outblocks.values():
            if hypercubes_overlap(read_buffer, outblock):
                nb_outfile_seeks_tmp = write_buffer(read_buffer, outblock, O)
                nb_outfile_seeks += nb_outfile_seeks_tmp
                nb_outfile_openings += 1

    return [
        nb_outfile_openings, nb_outfile_seeks, nb_infile_openings,
        nb_infile_seeks
    ]
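
# Minimal sketch of what get_blocks_shape is assumed to compute (the helper itself
# is not shown in these examples): the number of blocks per dimension when an
# array of shape R is cut into blocks of shape B. The real helper may handle edge
# cases differently.
import math

def get_blocks_shape_sketch(R, B):
    return tuple(math.ceil(r / b) for r, b in zip(R, B))

# e.g. get_blocks_shape_sketch((100, 100, 100), (50, 50, 50)) -> (2, 2, 2)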
Example #3
def create_case(args):
    paths = load_json(args.paths_config)

    for k, v in paths.items():
        if "PYTHONPATH" in k:
            sys.path.insert(0, v)

    from repartition_experiments.scripts_exp.exp_utils import create_empty_dir, create_input_chunks, create_input_chunks_distributed
    from repartition_experiments.algorithms.clustered_writes import clustered_writes
    from repartition_experiments.algorithms.utils import get_file_manager, get_blocks_shape

    # preprocessing
    fm = get_file_manager(args.file_format)
    R_stringlist, I_stringlist = args.R.split('_'), args.I.split('_')
    R, I = tuple(map(int, R_stringlist)), tuple(map(int, I_stringlist))
    print(R, I)
    indir_path, outdir_path = os.path.join(paths["ssd_path"], 'indir'), os.path.join(paths["ssd_path"], 'outdir')
    partition = get_blocks_shape(R, I)

    if args.distributed:  # only create the input blocks (no big image creation/splitting) and store each chunk in a round-robin fashion on the different disks of the cluster
        create_input_chunks_distributed(I, partition, indir_path, args.file_format)
        return

    if not args.splits_only:  # create the input image, then split it
        origarr_filepath = create_input_file(R, paths["ssd_path"], fm)
        print("input file created:", origarr_filepath)
        bpv = 2
        R_size = R[0]*R[1]*R[2]*bpv
        create_empty_dir(indir_path)
        create_empty_dir(outdir_path)
        clustered_writes(origarr_filepath, R, I, bpv, R_size, args.file_format, indir_path)
    else:  # only creates the input blocks, without creating the big image first and splitting it
        create_input_chunks(I, partition, indir_path, args.file_format)
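
# Hedged illustration of the argument namespace create_case expects; the flag
# names are taken from the function body above, but every value below is invented.
def example_create_case():
    import argparse
    args = argparse.Namespace(
        paths_config="paths.json",  # hypothetical config with "ssd_path" and PYTHONPATH entries
        file_format="HDF5",
        R="100_100_100",            # original array shape, underscore-separated
        I="50_50_50",               # input block shape, underscore-separated
        distributed=False,
        splits_only=False,
    )
    create_case(args)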
Example #4
def verify_results(outdir_path, original_array_path, R, O, file_format, addition, split_merge=False):
    """ Compare content of each output file against expected subarrays from original array.
    WARNING: this function opens all output files + the original array
    """

    if file_format == "HDF5":
        file_manager = HDF5_manager()
    else:
        print("File format not supported yet. Aborting...")
        sys.exit(1)

    partition = get_blocks_shape(R, O)
    orig_arr_data = file_manager.read_all(original_array_path)
    all_true = True

    if split_merge:
        result_arrpath = os.path.join(outdir_path, "0_0_0.hdf5")
        return file_manager.check_split_merge(original_array_path, result_arrpath)

    for i in range(partition[0]):
        for j in range(partition[1]):
            for k in range(partition[2]):
                outfilepath = os.path.join(outdir_path, str(i) + "_" + str(j) + "_" + str(k) + ".hdf5")
                data_stored = file_manager.read_all(outfilepath)
                ground_truth = orig_arr_data[i*O[0]:(i+1)*O[0],j*O[1]:(j+1)*O[1],k*O[2]:(k+1)*O[2]]
                
                if addition:
                    ground_truth = ground_truth + 1

                try:
                    assert np.allclose(data_stored, ground_truth, rtol=1e-02)
                    # print(f"Good output file {outfilepath}")
                except AssertionError:
                    print(f"Error: bad rechunking {outfilepath}")
                    print(f"Slices from ground truth {i*O[0]}:{(i+1)*O[0]}, {j*O[1]}:{(j+1)*O[1]}, {k*O[2]}:{(k+1)*O[2]}")
                    print("data_stored", data_stored)
                    print("ground_truth", ground_truth)
                    all_true = False  # do not return here, so that all failures are reported

    file_manager.close_infiles()  # close all files
    return all_true
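
# Self-contained sketch of the per-block check that verify_results performs: the
# (i, j, k)-th output file is expected to equal the corresponding O-shaped slab of
# the original array (optionally shifted by 1 when `addition` is set).
import numpy as np

def check_one_block(orig_arr_data, data_stored, i, j, k, O, addition=False):
    ground_truth = orig_arr_data[i*O[0]:(i+1)*O[0],
                                 j*O[1]:(j+1)*O[1],
                                 k*O[2]:(k+1)*O[2]]
    if addition:
        ground_truth = ground_truth + 1
    return np.allclose(data_stored, ground_truth, rtol=1e-02)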
def clustered_writes(origarr_filepath, R, cs, bpv, m, ff, outdir_path):
    """ Implementation of the clustered strategy for splitting a 3D array.
    Output file names are following the following regex: outdir_path/{i}_{j}_{k}.extension
    WARNING: this implementation loads the whole input array in RAM. We had 250GB of RAM for our experiments so we decided to use it.

    Arguments: 
    ----------
        R: original array shape
        m: memory available for the buffer
        cs: chunk shape
        bpv: number of bytes per voxel
        ff: file_format
        outdir_path: where to write the splits
    """

    strategies = {
        0: "blocks",
        1: "block_rows",
        2: "block_slices"
    }
    
    file_manager = get_file_manager(ff)

    partition = get_blocks_shape(R, cs)
    bs, brs, bss = get_entity_sizes(cs, bpv, partition)
    strategy = get_strategy(m, bs, brs, bss)

    origarr_size = R[0] * R[1] * R[2] * bpv
    buffers = compute_buffers(m, strategy, origarr_size, cs, bs, brs, bss, partition, R, bpv)

    origarr = file_manager.get_dataset(origarr_filepath, '/data')
    for buffer_index in range(len(buffers)):
        buffer = buffers[buffer_index]
        buffer_data = read_buffer(origarr, file_manager, buffer)
        write_splits(file_manager, buffer, buffer_data, cs, outdir_path)

    file_manager.close_infiles()
    get_opened_files()
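
# Hedged sketch of the decision get_strategy(m, bs, brs, bss) is assumed to make:
# read with the largest unit (block, block row or block slice) that fits in the
# memory budget m, using the strategy codes from the `strategies` dict above.
# The real implementation is not shown in these examples and may differ.
def get_strategy_sketch(m, bs, brs, bss):
    if m >= bss:
        return 2  # "block_slices": a whole slice of blocks fits in memory
    if m >= brs:
        return 1  # "block_rows": a whole row of blocks fits in memory
    return 0      # "blocks": fall back to reading one block at a time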
def write_splits(file_manager, buffer, buffer_data, cs, outdir_path):
    p1, p2 = buffer.get_corners()
    first_index = (p1[0]//cs[0], p1[1]//cs[1], p1[2]//cs[2])
    buffer_shape = (p2[0]-p1[0], p2[1]-p1[1], p2[2]-p1[2])
    buff_partition = get_blocks_shape(buffer_shape, cs)

    _3d_index = first_index
    for i in range(buff_partition[0]):
        for j in range(buff_partition[1]):
            for k in range(buff_partition[2]):
                split_data = buffer_data[
                    i * cs[0]:(i+1) * cs[0], 
                    j * cs[1]:(j+1) * cs[1], 
                    k * cs[2]:(k+1) * cs[2]]

                region = ((0, cs[0]), (0, cs[1]), (0, cs[2]))
                file_manager.write_data(int(_3d_index[0] + i), 
                                        int(_3d_index[1] + j), 
                                        int(_3d_index[2] + k), 
                                        outdir_path, 
                                        split_data, 
                                        region, 
                                        cs)
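
# Self-contained sketch of the slicing done in write_splits: cut a buffer into
# cs-shaped chunks with NumPy. The real function additionally maps each chunk to
# its global 3D index and writes it through file_manager.
import numpy as np

def split_buffer(buffer_data, cs):
    nb = tuple(s // c for s, c in zip(buffer_data.shape, cs))  # assumes exact division
    splits = {}
    for i in range(nb[0]):
        for j in range(nb[1]):
            for k in range(nb[2]):
                splits[(i, j, k)] = buffer_data[i*cs[0]:(i+1)*cs[0],
                                                j*cs[1]:(j+1)*cs[1],
                                                k*cs[2]:(k+1)*cs[2]]
    return splits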
Example #7
def baseline_rechunk(indir_path,
                     outdir_path,
                     O,
                     I,
                     R,
                     file_format,
                     addition,
                     distributed,
                     debug_mode=False,
                     clean_out_dir=False,
                     dont_write=False):
    """ Naive rechunk implementation in plain python.
    The input directory is supposed to contain the input files (output of the split process).
    WARNING: Does not clean the output directory after use by default.
    """

    print(f"Setting arguments...")
    global DEBUG_LOCAL
    global DONT_WRITE
    global tracker
    global outdirs_dict, outdir_index
    outdirs_dict = dict()
    outdir_index = 0
    tracker = Tracker()
    DEBUG_LOCAL = bool(debug_mode)
    DONT_WRITE = bool(dont_write)

    print("Addition mode:", addition)
    print("DONT_WRITE: ", DONT_WRITE)

    O, I, R = tuple(O), tuple(I), tuple(R)

    file_manager = get_file_manager(file_format)

    infiles_partition = get_blocks_shape(R, I)
    infiles_volumes = get_named_volumes(infiles_partition, I)
    outfiles_partition = get_blocks_shape(R, O)
    outfiles_volumes = get_named_volumes(outfiles_partition, O)
    outfiles_volumes = outfiles_volumes.values()

    if distributed:
        repartition_dict = None

        json_filename = '/disk0/gtimothee/repartition_dict.json'
        if not os.path.isfile(json_filename):
            # print("cannot find association dict json file")
            sys.exit(1)
        else:
            pass  # print(f"json file found")

        try:
            with open(json_filename) as f:
                repartition_dict = json.load(f)
        except Exception as e:
            print(e)
            # print("error (1)")
            sys.exit(1)

        if repartition_dict is None:
            # print("error (2)")
            sys.exit(1)
        else:
            pass  # print(f"Found reparition dict: {repartition_dict}")

        input_files = repartition_dict.values()
    else:
        input_files = file_manager.get_input_files(indir_path)

    t_read = 0
    t_write = 0

    vols_written = list()
    nb_infile_openings = 0
    nb_infile_seeks = 0
    nb_outfile_openings = 0
    nb_outfile_seeks = 0
    buffer_index = 1
    for input_file in input_files:
        print(f"Treating buffer: {buffer_index}...")
        buffer_index += 1
        nb_infile_openings += 1

        involume = get_volume(input_file, infiles_volumes, infiles_partition)
        t1 = time.time()
        if not DONT_WRITE:
            data = file_manager.read_data_from_fp(input_file, slices=None)
        else:
            data = None
        t1 = time.time() - t1
        t_read += t1

        for outvolume in outfiles_volumes:
            if hypercubes_overlap(involume, outvolume):
                shape, t2, nb_outfile_seeks_tmp = write_to_outfile(
                    involume, outvolume, data, outfiles_partition, outdir_path,
                    O, file_manager, addition, tracker)
                t_write += t2
                vols_written.append(shape)
                # nb_outfile_openings += 1 already included in nb_outfile_seeks
                nb_outfile_seeks += nb_outfile_seeks_tmp

        file_manager.close_infiles()

    if DONT_WRITE:
        assert tracker.is_complete(((0, 0, 0), R))

    # print("\nShapes written:")
    # for row in vols_written:
    #     print(row)

    if clean_out_dir:
        print("Cleaning output directory")
        file_manager.clean_directory(outdir_path)

    get_opened_files()

    return t_read, t_write, [
        nb_outfile_openings, nb_outfile_seeks, nb_infile_openings,
        nb_infile_seeks
    ]
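
# Hedged sketch of what hypercubes_overlap is assumed to test (the helper is not
# shown in these examples): whether two axis-aligned 3D volumes, each described by
# a (lower corner, upper corner) pair, intersect. The project's volume objects may
# expose their corners differently (e.g. via get_corners()).
def hypercubes_overlap_sketch(vol_a, vol_b):
    (a_lo, a_hi), (b_lo, b_hi) = vol_a, vol_b
    return all(a_lo[d] < b_hi[d] and b_lo[d] < a_hi[d] for d in range(3))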