Exemplo n.º 1
0
    def __setitem__(self, key, value):
        """Write the distributed array *value* into a window of the dataset.

        :param key: ``slice(None)`` (selects everything), a single index object,
            or a list/tuple holding one entry per dimension of ``self.shape``.
        :param value: pyDive ndarray; its shape has to equal the shape of the
            window selected by *key*.
        :raises AssertionError: if *value* is not a pyDive ndarray, if *key* does
            not address every dimension, if every component of *key* is an int
            (single data access) or if the window shape differs from ``value.shape``.
        """
        assert isinstance(value, ndarray.ndarray), "assigned array has to be a pyDive ndarray"

        if key == slice(None):
            key = [slice(None)] * len(self.shape)

        # wrap a scalar key *before* iterating over its components, otherwise
        # a single int or slice would raise a TypeError in the check below
        if not isinstance(key, (list, tuple)):
            key = (key,)

        assert len(key) == len(self.shape)

        # single data access is not allowed
        assert not all(type(key_comp) is int for key_comp in key),\
            "single data access is not allowed"

        new_shape, clean_slices = helper.subWindow_of_shape(self.shape, key)

        assert new_shape == value.shape

        # create local slice objects for each engine
        local_slices = helper.createLocalSlices(clean_slices, value.distaxis, value.idx_ranges)

        # scatter slice objects to the engines
        view = com.getView()
        view.scatter('window', local_slices, targets=value.targets_in_use)

        # write 'value' to disk in parallel
        view.execute('%s[tuple(window[0])] = %s' % (self.dataset_name, repr(value)),
                     targets=value.targets_in_use)
Exemplo n.º 2
0
    def __repr__(self):
        """Return the variable name representing this object on the targets.

        When ``onTarget == 'False'``, no local instance has been created yet and
        ``self.firstArray`` carries a ``target_ranks`` attribute, the object is
        first instantiated on those targets under a freshly generated name.

        :raises AssertionError: if not all arrays are distributed like
            ``self.firstArray``.
        """
        global arrayOfStructs_id

        needs_instance = (onTarget == 'False'
                          and not self.has_local_instance
                          and hasattr(self.firstArray, "target_ranks"))
        if not needs_instance:
            return self.name

        leaves = list(treeItems(self.structOfArrays))
        assert all(self.firstArray.is_distributed_like(arr) for _, arr in leaves),\
            "Cannot create a local virtual array-of-structs because not all arrays are distributed equally."

        # take over the distribution of the first array
        self.distaxes = self.firstArray.distaxes
        self.target_offsets = self.firstArray.target_offsets
        self.target_ranks = self.firstArray.target_ranks
        view = com.getView()
        self.view = view

        # unique variable name used on the targets for this instance
        self.name = 'arrayOfStructsObj' + str(arrayOfStructs_id)
        arrayOfStructs_id += 1

        # tree holding the target-side names of all arrays
        names_tree = makeTree_fromTree(self.structOfArrays,
                                       lambda arr: repr(arr))
        view.push({'names_tree': names_tree}, targets=self.target_ranks)

        # the indentation inside this literal is part of the executed code
        remote_code = '''\
                structOfArrays = structured.makeTree_fromTree(names_tree, lambda a_name: globals()[a_name])
                %s = structured.structured(structOfArrays)''' % self.name
        view.execute(remote_code, targets=self.target_ranks)

        self.has_local_instance = True
        return self.name
Exemplo n.º 3
0
    def __getitem__(self, args):
        """Read a window of the dataset into a new distributed ndarray.

        :param args: ``slice(None)`` (selects everything), a single index object,
            or a list/tuple holding one entry per dimension of ``self.shape``.
        :return: the array created by ``ndarray_factories.hollow`` for the
            selected window, filled on the targets from the dataset.
        :raises AssertionError: if *args* does not address every dimension or if
            every component of *args* is an int (single data access).
        """
        if args == slice(None):
            # a real list is required here: a generator has no len() and would
            # be wrapped as a single index below
            args = [slice(None)] * len(self.shape)

        if not isinstance(args, (list, tuple)):
            args = [args]

        assert len(self.shape) == len(args)

        # single data access is not allowed
        assert not all(type(arg) is int for arg in args),\
            "single data access is not allowed"

        new_shape, clean_slices = helper.subWindow_of_shape(self.shape, args)

        # result ndarray
        result = ndarray_factories.hollow(new_shape, self.distaxis, dtype=self.dtype)

        # create local slice objects for each engine
        local_slices = helper.createLocalSlices(clean_slices, self.distaxis, result.idx_ranges)

        # scatter slice objects to the engines
        view = com.getView()
        view.scatter('window', local_slices, targets=result.targets_in_use)

        # read the local window on every engine in use
        view.execute('%s = %s[tuple(window[0])]' % (result.name, self.dataset_name),
                     targets=result.targets_in_use)
        return result
Exemplo n.º 4
0
def binary_rop(rhs, lhs, op):
    """Execute the reflected binary operation ``lhs op rhs`` on the targets.

    :param rhs: array acting as right operand; the result is created with
        ``ndarray_factories.hollow_like(rhs)``.
    :param lhs: left operand, passed through ``__prepare_operand`` first.
    :param op: operator as a string, placed between both operands.
    :return: the array holding the result.
    """
    lhs = __prepare_operand(lhs, rhs)
    result = ndarray_factories.hollow_like(rhs)

    view = com.getView()
    # names are resolved in the order in which they appear in the statement
    pieces = (repr(result), repr(lhs), op, repr(rhs))
    statement = "%s = %s %s %s" % pieces
    view.execute(statement, targets=result.targets_in_use)
    return result
Exemplo n.º 5
0
    def __repr__(self):
        """Give the name under which this object is known on the targets.

        The target-side object is created lazily: only if ``onTarget == 'False'``,
        ``self.has_local_instance`` is not set and ``self.firstArray`` has a
        ``target_ranks`` attribute. Afterwards ``self.name`` is returned.

        :raises AssertionError: if the arrays are not all distributed like
            ``self.firstArray``.
        """
        global arrayOfStructs_id

        if not (onTarget == 'False' and not self.has_local_instance
                and hasattr(self.firstArray, "target_ranks")):
            # nothing to set up
            return self.name

        named_arrays = list(treeItems(self.structOfArrays))
        equally_distributed = all(
            self.firstArray.is_distributed_like(array) for _name, array in named_arrays)
        assert equally_distributed,\
            "Cannot create a local virtual array-of-structs because not all arrays are distributed equally."

        # distribution info is taken from the first array
        self.distaxes = self.firstArray.distaxes
        self.target_offsets = self.firstArray.target_offsets
        self.target_ranks = self.firstArray.target_ranks
        view = com.getView()
        self.view = view

        # generate a unique variable name used on target representing this instance
        self.name = 'arrayOfStructsObj' + str(arrayOfStructs_id)
        arrayOfStructs_id += 1

        # same tree structure, but with the arrays replaced by their target-side names
        names_tree = makeTree_fromTree(self.structOfArrays, lambda array: repr(array))
        view.push({'names_tree' : names_tree}, targets=self.target_ranks)

        # the leading whitespace inside the literal belongs to the executed code
        code = '''\
                structOfArrays = structured.makeTree_fromTree(names_tree, lambda a_name: globals()[a_name])
                %s = structured.structured(structOfArrays)''' % self.name
        view.execute(code, targets=self.target_ranks)

        self.has_local_instance = True
        return self.name
Exemplo n.º 6
0
def n_ary_fun(fun, *args):
    """Call the function named *fun* on the targets with *args* as arguments.

    :param fun: name of the function as a string, as it is known on the targets.
    :param args: operands; the first one has to be an array providing ``name``.
    :return: array created with ``ndarray_factories.hollow_like`` from the
        first operand, holding the result of the call.
    """
    first = args[0]

    # NOTE(review): the prepared operands are joined as strings here, whereas
    # binary_rop/binary_iop call repr() on them - confirm that
    # __prepare_operand returns strings in this context.
    operands = [first.name]
    for other in args[1:]:
        operands.append(__prepare_operand(other, first))
    call_args = ", ".join(operands)

    result = ndarray_factories.hollow_like(first)
    view = com.getView()
    statement = "%s = %s(%s)" % (repr(result), fun, call_args)
    view.execute(statement, targets=result.targets_in_use)
    #\todo: determine dtype from the result of fun and not from args[0]
    return result
Exemplo n.º 7
0
    def __init__(self, shape, distaxis, dtype=float, idx_ranges=None, targets_in_use=None, no_allocation=False):
        """Create a distributed array and (optionally) allocate it on the targets.

        :param shape: shape of the whole array.
        :param int distaxis: index of the axis that is distributed across the targets.
        :param dtype: datatype of a single element. The default is the builtin
            ``float``, which is what the removed alias ``np.float`` referred to.
        :param idx_ranges: list of ``(begin, end)`` pairs, one per target in use,
            giving the half-open index range of the distributed axis on that target.
            Has to be given together with *targets_in_use*.
        :param targets_in_use: list of indices of the targets holding a part of
            the array. Has to be given together with *idx_ranges*.
        :param bool no_allocation: if set, only ``None`` is pushed under the
            array's name instead of allocating a local array on each target.
        :raises AssertionError: if *distaxis* is not a valid axis of *shape*.
        :raises ValueError: if only one of *idx_ranges* and *targets_in_use* is given.
        """
        self.shape = list(shape)
        self.dtype = dtype
        self.distaxis = distaxis
        self.view = com.getView()

        assert 0 <= distaxis < len(self.shape)

        if idx_ranges is None and targets_in_use is None:
            # number of available targets (engines)
            num_targets_av = len(self.view.targets)

            axis_length = self.shape[distaxis]
            # length of the distributed axis on one target (rounded up).
            # '//' keeps this an integer independent of the division semantics.
            local_length = (axis_length - 1) // num_targets_av + 1

            # number of occupied targets by this ndarray instance
            num_targets = (axis_length - 1) // local_length + 1

            # decomposition of the distributed axis: one pair [begin, end) per target;
            # the last target takes whatever is left
            self.idx_ranges = [(r * local_length, (r + 1) * local_length)
                               for r in range(num_targets - 1)]
            self.idx_ranges.append(((num_targets - 1) * local_length, axis_length))
            # list of indices of the occupied targets
            self.targets_in_use = list(range(num_targets))
        elif idx_ranges is not None and targets_in_use is not None:
            self.idx_ranges = idx_ranges[:]
            self.targets_in_use = targets_in_use[:]
        else:
            raise ValueError("either args 'idx_ranges' and 'targets_in_use' have to be given both or not given both.")

        # generate a unique variable name used on the target representing this instance
        global ndarray_id
        self.name = 'dist_ndarray' + str(ndarray_id)
        ndarray_id += 1

        if no_allocation:
            self.view.push({self.name : None}, targets=self.targets_in_use)
        else:
            # instantiate an empty ndarray object of the appropriate shape on each target in use
            localshapes = []
            for i in range(len(self.targets_in_use)):
                begin, end = self.idx_ranges[i]
                localshape = self.shape[:]
                localshape[distaxis] = end - begin
                localshapes.append(localshape)

            self.view.scatter('localshape', localshapes, targets=self.targets_in_use)
            self.view.push({'dtype' : dtype}, targets=self.targets_in_use)
            self.view.execute('%s = empty(localshape[0], dtype=dtype)' % self.name, targets=self.targets_in_use)
Exemplo n.º 8
0
def array(array_like, distaxis):
    """Distribute a numpy array along *distaxis* across the targets.

    :param array_like: numpy array to distribute. For any other type nothing
        is done and ``None`` is returned.
    :param int distaxis: axis along which the array is split.
    :return: the distributed array, or ``None`` if *array_like* is not a numpy array.
    """
    if not isinstance(array_like, np.ndarray):
        return None

    # result ndarray; its local parts are filled by the scatter below
    result = ndarray.ndarray(array_like.shape, distaxis, array_like.dtype, no_allocation=True)

    # bring the distributed axis to the front, cut it into one piece per
    # target and move the axis of each piece back to its original position
    rolled = np.rollaxis(array_like, distaxis)
    pieces = [rolled[begin:end] for begin, end in result.idx_ranges]
    pieces = [np.rollaxis(piece, 0, distaxis+1) for piece in pieces]

    view = com.getView()
    view.scatter('sub_array', pieces, targets=result.targets_in_use)
    view.execute("%s = sub_array[0].copy()" % result.name, targets=result.targets_in_use)

    return result
Exemplo n.º 9
0
def reduce(array, op):
    """Perform a tree-like reduction over all axes of *array*.

    :param array: *pyDive.ndarray*, *pyDive.h5_ndarray* or *pyDive.cloned_ndarray* to be reduced
    :param numpy-ufunc op: reduce operation, e.g. *numpy.add*.
    :return: the reduced value.

    If the hdf5 data exceeds the memory limit (currently 25% of the combined main memory of all cluster nodes)\
    the data will be read block-wise so that a block fits into memory.
    """
    def reduce_wrapper(array_name, op_name):
        array = globals()[array_name]
        # look the ufunc up by name instead of evaluating a code string
        op = getattr(np, op_name)
        return algorithm.__tree_reduce(array, axis=None, op=op) # reduction over all axes

    view = com.getView()

    tmp_targets = view.targets # save current target list
    try:
        if type(array) == VirtualArrayOfStructs:
            view.targets = array.firstArray.target_ranks
        else:
            view.targets = array.target_ranks

        is_hdd_array = (hasattr(array, "arraytype") and array.arraytype in hdd_arraytypes) \
            or type(array) in hdd_arraytypes
        # arrays stored on disk are processed chunk by chunk, all others in one go
        chunks = fragment(array) if is_hdd_array else [array]

        result = None
        for chunk in chunks:
            targets_results = view.apply(interactive(reduce_wrapper), repr(chunk), op.__name__)
            chunk_result = op.reduce(targets_results) # reduce over targets' results

            if result is None:
                result = chunk_result
            else:
                result = op(result, chunk_result)
    finally:
        # restore the target list even if the remote call failed
        view.targets = tmp_targets
    return result
Exemplo n.º 10
0
def __bestStepSize(arrays, axis, memory_limit):
    """Determine the edge length along *axis* that can be processed at once.

    :param arrays: arrays providing ``nbytes`` and ``shape``; the edge length
        is taken from the first one.
    :param int axis: axis along which the arrays are going to be cut.
    :param memory_limit: factor applied to the available memory
        (presumably the fraction of memory allowed to be used - confirm with callers).
    :return: the full edge length if everything fits, otherwise the estimated
        step size rounded down to a power of two, but never less than 1.
    """
    view = com.getView()

    # minimum amount of memory available and memory needed, both per engine
    get_mem_av_node = interactive(lambda: psutil.virtual_memory().available)
    tmp_targets = view.targets
    view.targets = 'all'
    try:
        mem_av = min(view.apply(get_mem_av_node)) / com.getPPN()
        # float() avoids truncation to 0 for small arrays under integer division
        mem_needed = float(sum(a.nbytes for a in arrays)) / len(view)
    finally:
        # restore the target list even if querying the engines failed
        view.targets = tmp_targets

    # edge length of the whole array
    edge_length = arrays[0].shape[axis]

    if mem_needed <= 0:
        # no memory required at all, so no need to split
        return edge_length

    # maximum edge length on one engine according to the available memory
    step_size = memory_limit * edge_length * mem_av / mem_needed

    if step_size >= edge_length:
        return edge_length

    if step_size < 1:
        # a step has to cover at least one element
        return 1

    # round 'step_size' down to nearest power of two
    # (bit_length is exact, unlike a floating point logarithm)
    return 1 << (int(step_size).bit_length() - 1)
Exemplo n.º 11
0
def __bestStepSize(arrays, axis, memory_limit):
    """Estimate how many elements along *axis* fit into memory in one step.

    :param arrays: arrays providing ``nbytes`` and ``shape``; the edge length
        is read from the first array.
    :param int axis: axis along which the arrays are going to be cut.
    :param memory_limit: factor applied to the available memory
        (presumably the usable fraction of memory - confirm with callers).
    :return: the whole edge length if it fits, otherwise the largest power of
        two not exceeding the estimated step size (at least 1).
    """
    view = com.getView()

    # minimum amount of memory available and memory needed, both per engine
    get_mem_av_node = interactive(lambda: psutil.virtual_memory().available)
    tmp_targets = view.targets
    view.targets = 'all'
    try:
        mem_av = min(view.apply(get_mem_av_node)) / com.getPPN()
        # true division: integer division could round small sizes down to 0
        mem_needed = float(sum(a.nbytes for a in arrays)) / len(view)
    finally:
        # the previous target list is restored in any case
        view.targets = tmp_targets

    # edge length of the whole array
    edge_length = arrays[0].shape[axis]

    if mem_needed <= 0:
        # nothing to load, hence nothing to split
        return edge_length

    # maximum edge length on one engine according to the available memory
    step_size = memory_limit * edge_length * mem_av / mem_needed

    if step_size >= edge_length:
        return edge_length

    if step_size < 1:
        # never return a fractional or zero step
        return 1

    # round 'step_size' down to nearest power of two without relying on
    # floating point logarithms
    exponent = int(step_size).bit_length() - 1
    return 1 << exponent
Exemplo n.º 12
0
    def __init__(self, h5_filename, dataset_path, distaxis, max_elements_node=0):
        """Open a hdf5 dataset locally and on every target.

        :param h5_filename: name of the hdf5 file.
        :param dataset_path: path of the dataset inside the file.
        :param distaxis: stored as ``self.distaxis``.
        :param max_elements_node: stored as ``self.max_elements_node``.
        """
        self.h5_filename = h5_filename
        self.dataset_path = dataset_path

        # local handle, used to read shape and dtype
        self.dataset = h5.File(h5_filename, 'r')[dataset_path]

        self.shape = list(self.dataset.shape)
        self.distaxis = distaxis
        self.max_elements_node = max_elements_node
        self.dtype = self.dataset.dtype

        # generate a unique variable name used on the target representing this instance
        global h5_ndarray_id
        self.name = 'h5_ndarray' + str(h5_ndarray_id)
        h5_ndarray_id += 1

        # create dataset object on each target
        self.dataset_name = self.name + "_dataset"
        self.fileHandle_name = self.name + "_file"
        view = com.getView()
        # %r emits a properly escaped string literal, so quotes or backslashes
        # in the file name or dataset path cannot break the remote statement
        view.execute("%s = h5.File(%r, 'r', driver='mpio', comm=MPI.COMM_WORLD)"
                     % (self.fileHandle_name, h5_filename))
        view.execute("%s = %s[%r]"
                     % (self.dataset_name, self.fileHandle_name, dataset_path))
Exemplo n.º 13
0
def mapReduce(map_func, reduce_op, *arrays, **kwargs):
    """Applies *map_func* on :term:`engine` on local arrays related to *arrays*
    and reduces its result in a tree-like fashion over all axes.
    Example: ::

        cluster_array = pyDive.ones(shape=[100], distaxes=0)

        s = pyDive.mapReduce(lambda a: a**2, np.add, cluster_array) # a is the local numpy-array of *cluster_array*
        assert s == 100

    :param callable map_func: function to be called on :term:`engine`. Has to accept *numpy-arrays* and *kwargs*
    :param numpy-ufunc reduce_op: reduce operation, e.g. *numpy.add*.
    :param arrays: list of arrays including *pyDive.ndarrays*, *pyDive.h5_ndarrays* or *pyDive.cloned_ndarrays*
    :param kwargs: user-specified keyword arguments passed to *map_func*
    :return: the reduced value.
    :raises AssertionError: if the *shapes* of *pyDive.ndarrays* and *pyDive.h5_ndarrays* do not match
    :raises AssertionError: if the *distaxes* attributes of *pyDive.ndarrays* and *pyDive.h5_ndarrays* do not match

    Notes:
        - If the hdf5 data exceeds the memory limit (currently 25% of the combined main memory of all cluster nodes)\
            the data will be read block-wise so that a block fits into memory.
        - *mapReduce* chooses the list of *engines* from the **first** element of *arrays*. On these engines the mapReduce will be executed.\
            If the first array is a *pyDive.h5_ndarray* all engines will be used.
        - *mapReduce* is not writing data back to a *pyDive.h5_ndarray* yet.
        - *mapReduce* does not equalize the element distribution of *pyDive.ndarrays* before execution.
    """
    def mapReduce_wrapper(map_func, reduce_op_name, array_names, **kwargs):
        arrays = [globals()[array_name] for array_name in array_names]
        # look the ufunc up by name instead of evaluating a code string
        reduce_op = getattr(np, reduce_op_name)
        return algorithm.__tree_reduce(map_func(*arrays, **kwargs), axis=None, op=reduce_op)

    def is_cloned(a):
        return (hasattr(a, "arraytype") and a.arraytype is cloned_ndarray) or type(a) is cloned_ndarray

    def is_hdd(a):
        return (hasattr(a, "arraytype") and a.arraytype in hdd_arraytypes) or type(a) in hdd_arraytypes

    view = com.getView()
    tmp_targets = view.targets # save current target list
    try:
        if type(arrays[0]) == VirtualArrayOfStructs:
            view.targets = arrays[0].firstArray.target_ranks
        else:
            view.targets = arrays[0].target_ranks

        result = None

        if any(is_hdd(a) for a in arrays):
            cloned_arrays = [a for a in arrays if is_cloned(a)]
            other_arrays = [a for a in arrays if not is_cloned(a)]

            for fragments in fragment(*other_arrays):
                # NOTE(review): 'fragments' is never used; the names below are
                # taken from the unfragmented arrays. Presumably the names of
                # the fragments were intended - confirm against fragment().
                it_other_arrays = iter(other_arrays)
                it_cloned_arrays = iter(cloned_arrays)

                # keep the argument order of *arrays*
                array_names = [repr(next(it_cloned_arrays)) if is_cloned(a)
                               else repr(next(it_other_arrays))
                               for a in arrays]

                targets_results = view.apply(interactive(mapReduce_wrapper),
                    interactive(map_func), reduce_op.__name__, array_names, **kwargs)

                fragment_result = reduce_op.reduce(targets_results) # reduce over targets' results
                if result is None:
                    result = fragment_result
                else:
                    result = reduce_op(result, fragment_result)
        else:
            array_names = [repr(a) for a in arrays]
            targets_results = view.apply(interactive(mapReduce_wrapper),
                interactive(map_func), reduce_op.__name__, array_names, **kwargs)

            result = reduce_op.reduce(targets_results) # reduce over targets' results
    finally:
        # restore target list, also if something went wrong on the engines
        view.targets = tmp_targets

    return result
Exemplo n.º 14
0
def map(f, *arrays, **kwargs):
    """Applies *f* on :term:`engine` on local arrays related to *arrays*.
    Example: ::

        cluster_array = pyDive.ones(shape=[100], distaxes=0)

        cluster_array *= 2.0
        # equivalent to
        def twice(a): # a is the local numpy-array of *cluster_array*
            a *= 2.0
        pyDive.map(twice, cluster_array)

    Or, as a decorator: ::

        @pyDive.map
        def twice(a):
            a *= 2.0

        twice(cluster_array)

    :param callable f: function to be called on :term:`engine`. Has to accept *numpy-arrays* and *kwargs*
    :param arrays: list of arrays including *pyDive.ndarrays*, *pyDive.h5_ndarrays* or *pyDive.cloned_ndarrays*
    :param kwargs: user-specified keyword arguments passed to *f*
    :return: ``None``, or - if no arrays are given (decorator mode) - a function
        which applies *f* to the arrays it is called with.
    :raises AssertionError: if the *shapes* of *pyDive.ndarrays* and *pyDive.h5_ndarrays* do not match
    :raises AssertionError: if the *distaxes* attributes of *pyDive.ndarrays* and *pyDive.h5_ndarrays* do not match

    Notes:
        - If the hdf5 data exceeds the memory limit (currently 25% of the combined main memory of all cluster nodes)\
            the data will be read block-wise so that a block fits into memory.
        - *map* chooses the list of *engines* from the **first** element of *arrays*. On these engines *f* is called.\
            If the first array is a *pyDive.h5_ndarray* all engines will be used.
        - *map* is not writing data back to a *pyDive.h5_ndarray* yet.
        - *map* does not equalize the element distribution of *pyDive.ndarrays* before execution.
    """
    if not arrays:
        # decorator mode
        import functools

        # keep name and docstring of the decorated function
        @functools.wraps(f)
        def map_deco(*arrays, **kwargs):
            map(f, *arrays, **kwargs)
        return map_deco

    def map_wrapper(f, array_names, **kwargs):
        arrays = [globals()[array_name] for array_name in array_names]
        f(*arrays, **kwargs)

    def is_cloned(a):
        return (hasattr(a, "arraytype") and a.arraytype is cloned_ndarray) or type(a) is cloned_ndarray

    def is_hdd(a):
        return (hasattr(a, "arraytype") and a.arraytype in hdd_arraytypes) or type(a) in hdd_arraytypes

    view = com.getView()

    tmp_targets = view.targets # save current target list
    try:
        if type(arrays[0]) == VirtualArrayOfStructs:
            view.targets = arrays[0].firstArray.target_ranks
        else:
            view.targets = arrays[0].target_ranks

        if any(is_hdd(a) for a in arrays):
            cloned_arrays = [a for a in arrays if is_cloned(a)]
            other_arrays = [a for a in arrays if not is_cloned(a)]

            for fragments in fragment(*other_arrays):
                # NOTE(review): 'fragments' is never used; the names below are
                # taken from the unfragmented arrays. Presumably the names of
                # the fragments were intended - confirm against fragment().
                it_other_arrays = iter(other_arrays)
                it_cloned_arrays = iter(cloned_arrays)

                # keep the argument order of *arrays*
                array_names = [repr(next(it_cloned_arrays)) if is_cloned(a)
                               else repr(next(it_other_arrays))
                               for a in arrays]

                view.apply(interactive(map_wrapper), interactive(f), array_names, **kwargs)
        else:
            array_names = [repr(a) for a in arrays]
            view.apply(interactive(map_wrapper), interactive(f), array_names, **kwargs)
    finally:
        # restore target list, also if something went wrong on the engines
        view.targets = tmp_targets
Exemplo n.º 15
0
 def __del__(self):
     """Close the file handle that belongs to this instance on the targets."""
     # a partially constructed instance has no remote handle to close
     handle_name = getattr(self, "fileHandle_name", None)
     if handle_name is None:
         return
     view = com.getView()
     view.execute("%s.close()" % handle_name)
Exemplo n.º 16
0
def unary_op(a, op):
    """Apply the unary operator *op* to *a* on the targets.

    :param a: operand array; the result is created with
        ``ndarray_factories.hollow_like(a)``.
    :param op: operator as a string, placed directly in front of the operand.
    :return: the array holding the result.
    """
    result = ndarray_factories.hollow_like(a)
    view = com.getView()
    target_name = repr(result)
    operand_name = repr(a)
    statement = "%s = %s%s" % (target_name, op, operand_name)
    view.execute(statement, targets=result.targets_in_use)
    return result
Exemplo n.º 17
0
def binary_iop(lhs, rhs, iop):
    """Execute the in-place operation ``lhs iop rhs`` on the targets.

    :param lhs: array that is modified on the targets in use.
    :param rhs: right operand, passed through ``__prepare_operand`` first.
    :param iop: in-place operator as a string, placed between both operands.
    :return: *lhs*
    """
    rhs = __prepare_operand(rhs, lhs)

    view = com.getView()
    left_name = repr(lhs)
    right_name = repr(rhs)
    statement = "%s %s %s" % (left_name, iop, right_name)
    view.execute(statement, targets=lhs.targets_in_use)
    return lhs