Example #1
    def _kernel(result, source, powers, powers_idx):

        if powers_view:
            powers = powers.reshape(product(batch_shape),
                                    powers_shape[-1])[:, powers_idx]
        else:
            powers = powers.flatten()

        result = result.reshape(product(batch_shape),
                                product(poly_batch_shape), polynomial_degree)
        source = source.reshape(product(batch_shape),
                                product(poly_batch_shape), polynomial_degree)

        if invert_powers:
            powers = 2 * polynomial_degree - powers

        for i in range(result.shape[0]):
            power = powers[i]
            if power < polynomial_degree:
                result[i, :, :power] = -source[i, :, (polynomial_degree -
                                                      power):polynomial_degree]
                result[i, :, power:polynomial_degree] = source[i, :, :(
                    polynomial_degree - power)]
            else:
                power = power - polynomial_degree
                result[i, :, :power] = source[i, :, (polynomial_degree -
                                                     power):polynomial_degree]
                result[i, :, power:polynomial_degree] = -source[i, :, :(
                    polynomial_degree - power)]

        if minus_one:
            result -= source
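
The branching above matches multiplication of each polynomial by X**power modulo X**N + 1 (a negacyclic shift): coefficients pushed past the degree boundary wrap around with a sign flip, and powers in [N, 2N) use the identity X**N == -1; the minus_one flag then turns the result into multiplication by X**power - 1. A minimal NumPy sketch of the same index arithmetic for a single coefficient vector (negacyclic_shift is a hypothetical standalone helper, not part of the kernel):

import numpy

def negacyclic_shift(coeffs, power):
    # Multiply a polynomial by X**power in Z[X]/(X**N + 1), where N = len(coeffs).
    n = len(coeffs)
    power = power % (2 * n)  # X**(2N) == 1 in this ring
    result = numpy.empty_like(coeffs)
    if power < n:
        # Coefficients that wrap around the boundary pick up a minus sign.
        result[:power] = -coeffs[n - power:]
        result[power:] = coeffs[:n - power]
    else:
        # X**N == -1: shift by power - N and flip the signs the other way.
        power -= n
        result[:power] = coeffs[n - power:]
        result[power:] = -coeffs[:n - power]
    return result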
Example #2
def test_group_dimensions(virtual_shape, available_shape):
    """
    Tests that ``group_dimensions()`` obeys its contracts.
    """
    v_groups, a_groups = vsize.group_dimensions(virtual_shape, available_shape)
    v_dims = []
    a_dims = []
    for v_group, a_group in zip(v_groups, a_groups):

        # Check that axis indices in groups are actually in range
        assert all(vdim < len(virtual_shape) for vdim in v_group)
        assert all(adim < len(available_shape) for adim in a_group)

        # Check that the total number of elements (threads) in the virtual group
        # is not greater than the number of elements in the real group
        v_shape = virtual_shape[v_group[0]:v_group[-1]+1]
        a_shape = available_shape[a_group[0]:a_group[-1]+1]
        assert product(v_shape) <= product(a_shape)

        v_dims += v_group
        a_dims += a_group

    # Check that the virtual and real group axes together form consecutive,
    # non-overlapping ranges.
    assert v_dims == list(range(len(virtual_shape)))
    assert a_dims == list(range(len(available_shape[:len(a_dims)])))
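
Most snippets on this page rely on two small helpers from reikna.helpers: product and min_blocks. For reference, here is a minimal sketch consistent with how they are used in these examples (the library's own implementations may differ in detail):

from functools import reduce

def product(seq):
    # Product of all elements; an empty sequence yields 1, which is what makes
    # expressions like product(shape[:0]) act as a neutral batch size.
    return reduce(lambda x, y: x * y, seq, 1)

def min_blocks(length, block):
    # Ceiling division: the minimum number of blocks of size ``block``
    # needed to cover ``length`` elements.
    return (length - 1) // block + 1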
Example #3
    def __call__(self, psi):
        psi_nobs = psi.get()
        psi = beam_splitter(psi_nobs)

        ns_nobs = numpy.abs(psi_nobs) ** 2 - 0.5 / product(self.ds)
        ns = numpy.abs(psi) ** 2 - 0.5 / product(self.ds)

        if len(psi.shape) == 3:
            # 1D
            n_nobs = ns_nobs.mean(1)
            n = ns.mean(1)
            Ns = (ns * self.ds[0]).sum(-1)
        elif len(psi.shape) == 4:
            # 2D
            n_nobs = ns_nobs.mean(1)
            n = ns.mean(1)
            Ns = (ns * product(self.ds)).sum(-1).sum(-1)
        elif len(psi.shape) == 5:
            # 3D
            n_nobs = (ns_nobs.mean(1) * self.ds[1] * self.ds[2]).sum(-1).sum(-1)
            n = (ns.mean(1) * self.ds[1] * self.ds[2]).sum(-1).sum(-1)
            Ns = (ns * product(self.ds)).sum(-1).sum(-1).sum(-1)

        res = dict(
            Nplus_mean=Ns[0].mean(), Nminus_mean=Ns[1].mean(),
            Nplus_std=Ns[0].std(), Nminus_std=Ns[1].std(),
            density=n,
            density_nobs=n_nobs)

        if len(psi.shape) == 5:
            res['slice_nobs'] = ns_nobs.mean(1)[:, :, :, ns.shape[-1] // 2]
            res['slice'] = ns.mean(1)[:, :, :, ns.shape[-1] // 2]

        return res
Example #4
File: fft.py Project: mgolub2/reikna
    def __init__(self, dtype, device_params, outer_shape, fft_size, curr_size,
            fft_size_real, inner_shape, pass_num, reverse_direction):

        num_passes = len(get_global_radix_info(fft_size)[0])
        real_output_shape = (pass_num == num_passes - 1 and reverse_direction)

        self.name = 'fft_global'
        self.inplace_possible = (pass_num == num_passes - 1 and num_passes % 2 == 1)
        self.output_shape = (outer_shape +
            (fft_size_real if real_output_shape else fft_size,) + inner_shape)
        if fft_size != fft_size_real and pass_num == 0 and reverse_direction:
            self.kweights = get_kweights(fft_size_real, fft_size)
        else:
            self.kweights = None

        self._fft_size = fft_size
        self._curr_size = curr_size
        self._fft_size_real = fft_size_real
        self._local_mem_size = device_params.local_mem_size
        self._itemsize = dtype.itemsize
        self._inner_batch = helpers.product(inner_shape)
        self._outer_batch = helpers.product(outer_shape)
        self._pass_num = pass_num
        self._last_pass = (pass_num == num_passes - 1)

        self._constant_kwds = get_common_kwds(dtype, device_params)
        self._constant_kwds.update(dict(
            takes_kweights=(self.kweights is not None),
            input_slices=(len(outer_shape), 1, len(inner_shape)),
            output_slices=(len(outer_shape), 1, len(inner_shape)),
            pad_in=(fft_size != fft_size_real and pass_num == 0 and not reverse_direction),
            unpad_out=(fft_size != fft_size_real and self._last_pass and reverse_direction),
            reverse_direction=reverse_direction,
            normalize=self._last_pass))
Example #5
def check_performance(thr_and_double, shape_and_axes):

    thr, double = thr_and_double

    dtype = numpy.complex128 if double else numpy.complex64
    dtype = dtypes.normalize_type(dtype)

    shape, axes = shape_and_axes

    data = numpy.arange(product(shape)).reshape(shape).astype(dtype)

    shift = FFTShift(data, axes=axes)
    shiftc = shift.compile(thr)

    data_dev = thr.to_device(data)
    res_dev = thr.empty_like(data)

    attempts = 10
    times = []
    for i in range(attempts):
        t1 = time.time()
        shiftc(res_dev, data_dev)
        thr.synchronize()
        times.append(time.time() - t1)

    res_ref = numpy.fft.fftshift(data, axes=axes)
    assert diff_is_negligible(res_dev.get(), res_ref)

    return min(times), product(shape) * dtype.itemsize
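
The returned pair (best time, bytes processed) is enough to derive an effective bandwidth figure; a hypothetical post-processing step (assuming the fixture values are supplied by the test harness):

min_time, nbytes = check_performance(thr_and_double, shape_and_axes)
print('effective bandwidth: {:.2f} GB/s'.format(nbytes / min_time / 1e9))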
Example #6
File: fft.py Project: ringw/reikna
    def prepare_for(self, max_local_size):
        kwds = dict(self._constant_kwds)

        radix_arr, radix1_arr, radix2_arr = get_global_radix_info(
            self._fft_size)

        radix = radix_arr[self._pass_num]
        radix1 = radix1_arr[self._pass_num]
        radix2 = radix2_arr[self._pass_num]

        stride_out = self._inner_batch * helpers.product(
            radix_arr[:self._pass_num])
        stride = stride_out * radix
        stride_in = stride_out * helpers.product(
            radix_arr[self._pass_num + 1:])

        threads_per_xform = radix2

        coalesce_width = kwds['min_mem_coalesce_width']
        local_batch = max_local_size if radix2 == 1 else coalesce_width
        local_batch = min(local_batch, stride_in)
        local_size = min(local_batch * threads_per_xform, max_local_size)
        local_batch = local_size // threads_per_xform

        workgroups_num = helpers.min_blocks(stride_in,
                                            local_batch) * self._outer_batch

        if radix2 == 1:
            lmem_size = 0
        else:
            if stride_out == 1:
                lmem_size = (radix + 1) * local_batch
            else:
                lmem_size = local_size * radix1

        if lmem_size * self._itemsize // 2 > self._local_mem_size:
            raise OutOfResourcesError

        kwds.update(self._constant_kwds)
        kwds.update(
            dict(fft_size=self._fft_size,
                 curr_size=self._curr_size,
                 fft_size_real=self._fft_size_real,
                 pass_num=self._pass_num,
                 lmem_size=lmem_size,
                 local_batch=local_batch,
                 local_size=local_size,
                 inner_batch=self._inner_batch,
                 radix_arr=radix_arr,
                 radix1_arr=radix1_arr,
                 radix2_arr=radix2_arr,
                 radix1=radix1,
                 radix2=radix2,
                 radix=radix,
                 stride_in=stride_in,
                 stride_out=stride_out,
                 stride=stride,
                 last_pass=self._last_pass))

        return workgroups_num * local_size, local_size, kwds
Example #7
    def __init__(self, virtual_shape, available_shape):
        self.real_dims = {}
        self.real_strides = {}
        self.virtual_strides = {}
        self.major_vdims = {}
        self.bounding_shape = tuple()
        self.skip_thresholds = []

        v_groups, a_groups = group_dimensions(virtual_shape, available_shape)

        for v_group, a_group in zip(v_groups, a_groups):
            virtual_subshape = virtual_shape[v_group[0]:v_group[-1]+1]
            virtual_subsize = product(virtual_subshape)

            bounding_subshape = find_bounding_shape(
                virtual_subsize,
                available_shape[a_group[0]:a_group[-1]+1])

            self.bounding_shape += bounding_subshape

            if virtual_subsize < product(bounding_subshape):
                strides = [(adim, product(bounding_subshape[:i])) for i, adim in enumerate(a_group)]
                self.skip_thresholds.append((virtual_subsize, strides))

            for vdim in v_group:
                self.real_dims[vdim] = a_group
                self.real_strides[vdim] = tuple(
                    product(self.bounding_shape[a_group[0]:adim]) for adim in a_group)
                self.virtual_strides[vdim] = product(virtual_shape[v_group[0]:vdim])
                self.major_vdims[vdim] = v_group[-1]
Example #8
def find_local_size(global_size, flat_local_size, threshold=0.05):
    """
    Returns a tuple of the same length as ``global_size``,
    with the product equal to ``flat_local_size``,
    which minimizes the difference between ``product(global_size)``
    and ``product(ls * min_blocks(gs, ls) for gs, ls in zip(global_size, local_size))``
    (i.e. tries to minimize the number of empty threads).
    """
    flat_global_size = product(global_size)
    if flat_local_size >= flat_global_size:
        return global_size

    threads_num = flat_global_size

    best_ratio = None
    best_local_size = None

    for local_size in get_decompositions(flat_local_size, len(global_size)):
        bounding_global_size = tuple(
            ls * min_blocks(gs, ls) for gs, ls in zip(global_size, local_size))
        empty_threads = product(bounding_global_size) - threads_num
        ratio = float(empty_threads) / threads_num

        # Stopping iteration early, because there may be a lot of elements to iterate over,
        # and we do not need the perfect solution.
        if ratio < threshold:
            return local_size

        if best_ratio is None or ratio < best_ratio:
            best_ratio = ratio
            best_local_size = local_size

    return best_local_size
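
A usage sketch, assuming product and min_blocks as sketched earlier; since the exact decomposition depends on get_decompositions, only the invariants are checked:

global_size = (72, 58, 32)
local_size = find_local_size(global_size, 256)
assert product(local_size) == 256

# The chosen local size is then used to pad each axis of the global size
# up to a multiple of the corresponding local dimension:
bounding_global_size = tuple(
    ls * min_blocks(gs, ls) for gs, ls in zip(global_size, local_size))
assert all(bg >= gs for bg, gs in zip(bounding_global_size, global_size))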
Example #9
    def _add_transpose(self, plan, device_params,
            mem_out, mem_in, batch_shape, height_shape, width_shape):

        bso = self._block_width_override
        block_width = device_params.local_mem_banks if bso is None else bso

        if block_width ** 2 > device_params.max_work_group_size:
            # If it is not CPU, current solution may affect performance
            block_width = int(numpy.sqrt(device_params.max_work_group_size))

        input_height = helpers.product(height_shape)
        input_width = helpers.product(width_shape)
        batch = helpers.product(batch_shape)

        blocks_per_matrix = helpers.min_blocks(input_height, block_width)
        grid_width = helpers.min_blocks(input_width, block_width)

        render_kwds = dict(
            input_width=input_width, input_height=input_height, batch=batch,
            block_width=block_width,
            grid_width=grid_width,
            blocks_per_matrix=blocks_per_matrix,
            input_slices=[len(batch_shape), len(height_shape), len(width_shape)],
            output_slices=[len(batch_shape), len(width_shape), len(height_shape)])

        plan.kernel_call(
            TEMPLATE.get_def('transpose'), [mem_out, mem_in],
            global_size=(batch, blocks_per_matrix * block_width, grid_width * block_width),
            local_size=(1, block_width, block_width),
            render_kwds=render_kwds)
Example #10
    def _add_transpose(self, plan, device_params,
            mem_out, mem_in, batch_shape, height_shape, width_shape):

        bso = self._block_width_override
        block_width = device_params.local_mem_banks if bso is None else bso

        if block_width ** 2 > device_params.max_work_group_size:
            # If it is not CPU, current solution may affect performance
            block_width = int(numpy.sqrt(device_params.max_work_group_size))

        input_height = helpers.product(height_shape)
        input_width = helpers.product(width_shape)
        batch = helpers.product(batch_shape)

        blocks_per_matrix = helpers.min_blocks(input_height, block_width)
        grid_width = helpers.min_blocks(input_width, block_width)

        render_kwds = dict(
            input_width=input_width, input_height=input_height, batch=batch,
            block_width=block_width,
            grid_width=grid_width,
            blocks_per_matrix=blocks_per_matrix,
            input_slices=[len(batch_shape), len(height_shape), len(width_shape)],
            output_slices=[len(batch_shape), len(width_shape), len(height_shape)])

        plan.kernel_call(
            TEMPLATE.get_def('transpose'), [mem_out, mem_in],
            kernel_name="kernel_transpose",
            global_size=(batch, blocks_per_matrix * block_width, grid_width * block_width),
            local_size=(1, block_width, block_width),
            render_kwds=render_kwds)
Example #11
def group_dimensions(virtual_shape, available_shape):
    """
    Returns two lists: one of tuples with the indices of grouped virtual
    dimensions, the other of tuples with the indices of the corresponding
    groups of available dimensions, such that for any group of virtual
    dimensions, the total number of elements they cover does not exceed
    the number of elements covered by the corresponding group of available dimensions.
    """
    assert product(virtual_shape) <= product(available_shape)
    return _group_dimensions(0, virtual_shape, 0, available_shape)
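
An illustrative call, assuming product as sketched earlier; the grouping itself is implementation-defined, so the sketch only exercises the documented contract:

virtual_shape = (7, 11, 13)   # 1001 virtual threads
available_shape = (64, 32)    # 2048 available threads

v_groups, a_groups = group_dimensions(virtual_shape, available_shape)
for v_group, a_group in zip(v_groups, a_groups):
    v_size = product(virtual_shape[v_group[0]:v_group[-1] + 1])
    a_size = product(available_shape[a_group[0]:a_group[-1] + 1])
    assert v_size <= a_size  # the contract stated in the docstring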
Example #12
    def __init__(self, dtype, shape=None, strides=None):
        self.shape = tuple() if shape is None else wrap_in_tuple(shape)
        self.size = product(self.shape)
        self.dtype = dtypes.normalize_type(dtype)
        self.ctype = dtypes.ctype_module(self.dtype)
        if strides is None:
            self.strides = tuple([
                self.dtype.itemsize * product(self.shape[i+1:]) for i in range(len(self.shape))])
        else:
            self.strides = strides
        self._cast = dtypes.cast(self.dtype)
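
The default strides computed here are the ordinary C-contiguous byte strides. A quick NumPy cross-check of the formula, assuming the product sketch above:

import numpy

shape = (3, 4, 5)
dtype = numpy.dtype(numpy.float32)
strides = tuple(
    dtype.itemsize * product(shape[i + 1:]) for i in range(len(shape)))
# NumPy computes the same byte strides for a C-contiguous array:
assert strides == numpy.zeros(shape, dtype=dtype).strides  # (80, 20, 4)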
Example #13
File: matrixmul.py Project: xexo7C8/reikna
    def _build_plan(self, plan_factory, device_params, output, matrix_a,
                    matrix_b):
        bwo = self._block_width_override

        if bwo is not None:
            block_widths = [bwo]
        else:
            nbanks = device_params.local_mem_banks
            block_widths = [2**n for n in range(helpers.log2(nbanks), -1, -1)]

        a_batch = helpers.product(matrix_a.shape[:-2])
        b_batch = helpers.product(matrix_b.shape[:-2])
        batch = max(a_batch, b_batch)

        for block_width in block_widths:

            plan = plan_factory()

            if block_width**2 > device_params.max_work_group_size:
                continue

            num_steps = helpers.min_blocks(self._convolution_size, block_width)
            a_blocks = helpers.min_blocks(self._a_outer_size, block_width)
            b_blocks = helpers.min_blocks(self._b_outer_size, block_width)

            render_kwds = dict(batched_a=(a_batch != 1),
                               batched_b=(b_batch != 1),
                               transposed_a=self._transposed_a,
                               transposed_b=self._transposed_b,
                               num_steps=num_steps,
                               a_slices=(len(matrix_a.shape) - 2, 1, 1),
                               b_slices=(len(matrix_b.shape) - 2, 1, 1),
                               output_slices=(len(output.shape) - 2, 1, 1),
                               block_width=block_width,
                               mul=functions.mul(matrix_a.dtype,
                                                 matrix_b.dtype,
                                                 out_dtype=output.dtype))

            try:
                plan.kernel_call(TEMPLATE.get_def('matrixmul'),
                                 [output, matrix_a, matrix_b],
                                 kernel_name="kernel_matrixmul",
                                 global_size=(batch, a_blocks * block_width,
                                              b_blocks * block_width),
                                 local_size=(1, block_width, block_width),
                                 render_kwds=render_kwds)
            except OutOfResourcesError:
                continue

            return plan

        raise ValueError(
            "Could not find suitable call parameters for the kernel")
Example #14
File: matrixmul.py Project: fjarri/reikna
    def _build_plan(self, plan_factory, device_params, output, matrix_a, matrix_b):
        bwo = self._block_width_override

        if bwo is not None:
            block_widths = [bwo]
        else:
            nbanks = device_params.local_mem_banks
            block_widths = [2 ** n for n in range(helpers.log2(nbanks), -1, -1)]

        a_batch = helpers.product(matrix_a.shape[:-2])
        b_batch = helpers.product(matrix_b.shape[:-2])
        batch = max(a_batch, b_batch)

        for block_width in block_widths:

            plan = plan_factory()

            if block_width ** 2 > device_params.max_work_group_size:
                continue

            num_steps = helpers.min_blocks(self._convolution_size, block_width)
            a_blocks = helpers.min_blocks(self._a_outer_size, block_width)
            b_blocks = helpers.min_blocks(self._b_outer_size, block_width)

            render_kwds = dict(
                batched_a=(a_batch != 1),
                batched_b=(b_batch != 1),
                transposed_a=self._transposed_a,
                transposed_b=self._transposed_b,
                num_steps=num_steps,
                a_slices=(len(matrix_a.shape) - 2, 1, 1),
                b_slices=(len(matrix_b.shape) - 2, 1, 1),
                output_slices=(len(output.shape) - 2, 1, 1),
                block_width=block_width,
                mul=functions.mul(matrix_a.dtype, matrix_b.dtype, out_dtype=output.dtype))

            try:
                plan.kernel_call(
                    TEMPLATE.get_def('matrixmul'),
                    [output, matrix_a, matrix_b],
                    kernel_name="kernel_matrixmul",
                    global_size=(
                        batch,
                        a_blocks * block_width,
                        b_blocks * block_width),
                    local_size=(1, block_width, block_width),
                    render_kwds=render_kwds)
            except OutOfResourcesError:
                continue

            return plan

        raise ValueError("Could not find suitable call parameters for the kernel")
Example #15
File: signature.py Project: ringw/reikna
    def __init__(self, dtype, shape=None, strides=None):
        self.shape = tuple() if shape is None else wrap_in_tuple(shape)
        self.size = product(self.shape)
        self.dtype = dtypes.normalize_type(dtype)
        self.ctype = dtypes.ctype_module(self.dtype)
        if strides is None:
            self.strides = tuple([
                self.dtype.itemsize * product(self.shape[i + 1:])
                for i in range(len(self.shape))
            ])
        else:
            self.strides = strides
        self._cast = dtypes.cast(self.dtype)
Example #16
    def compatible_with(self, other):
        if self.dtype != other.dtype:
            return False

        common_shape_len = min(len(self.shape), len(other.shape))
        if self.shape[-common_shape_len:] != other.shape[-common_shape_len:]:
            return False
        if self.strides[-common_shape_len:] != other.strides[-common_shape_len:]:
            return False
        if helpers.product(self.shape[:-common_shape_len]) != 1:
            return False
        if helpers.product(other.shape[:-common_shape_len]) != 1:
            return False

        return True
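
The predicate treats two arrays as compatible when their trailing dimensions (shapes and strides) coincide and any extra leading dimensions hold a single element each. A standalone sketch of the shape half of that test, written to sidestep the [-0:] slicing pitfall when one of the shapes is scalar (shapes_compatible is a hypothetical helper; product as sketched earlier):

def shapes_compatible(shape_a, shape_b):
    # Trailing dimensions must match; extra leading dimensions must be trivial.
    common = min(len(shape_a), len(shape_b))
    return (shape_a[len(shape_a) - common:] == shape_b[len(shape_b) - common:]
            and product(shape_a[:len(shape_a) - common]) == 1
            and product(shape_b[:len(shape_b) - common]) == 1)

assert shapes_compatible((1, 1, 16, 16), (16, 16))
assert not shapes_compatible((2, 16, 16), (16, 16))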
Example #17
File: test_vsizes.py Project: ringw/reikna
def test_find_local_size(global_size, flat_local_size, expected_local_size):
    """
    Checking that ``find_local_size`` finds the sizes we expect from it.
    """
    local_size = vsize.find_local_size(global_size, flat_local_size)
    assert product(local_size) == flat_local_size
    assert local_size == expected_local_size
Example #18
File: fft.py Project: mgolub2/reikna
    def __init__(self, dtype, device_params, outer_shape, fft_size, fft_size_real,
            inner_shape, reverse_direction):

        self.name = "fft_local"
        self.inplace_possible = True
        self.output_shape = outer_shape + (fft_size_real if reverse_direction else fft_size,)
        if fft_size_real != fft_size and reverse_direction:
            self.kweights = get_kweights(fft_size_real, fft_size)
        else:
            self.kweights = None

        self._fft_size = fft_size
        self._fft_size_real = fft_size_real
        self._outer_batch = helpers.product(outer_shape)
        self._local_mem_size = device_params.local_mem_size
        self._itemsize = dtype.itemsize

        self._constant_kwds = get_common_kwds(dtype, device_params)
        self._constant_kwds.update(dict(
            takes_kweights=(self.kweights is not None),
            input_slices=(len(outer_shape), 1, len(inner_shape)),
            output_slices=(len(outer_shape), 1, len(inner_shape)),
            pad_in=(fft_size != fft_size_real and not reverse_direction),
            unpad_out=(fft_size != fft_size_real and reverse_direction),
            reverse_direction=reverse_direction,
            normalize=True))
Example #19
    def __init__(self, dtype, shape=None, strides=None, offset=0, nbytes=None):
        self.shape = tuple() if shape is None else wrap_in_tuple(shape)
        self.size = product(self.shape)
        self.dtype = dtypes.normalize_type(dtype)
        self.ctype = dtypes.ctype_module(self.dtype)

        default_strides = helpers.default_strides(self.shape,
                                                  self.dtype.itemsize)
        if strides is None:
            strides = default_strides
        else:
            strides = tuple(strides)
        self._default_strides = strides == default_strides
        self.strides = strides

        default_nbytes = helpers.min_buffer_size(self.shape,
                                                 self.dtype.itemsize,
                                                 self.strides)
        if nbytes is None:
            nbytes = default_nbytes
        self._default_nbytes = nbytes == default_nbytes
        self.nbytes = nbytes

        self.offset = offset
        self._cast = dtypes.cast(self.dtype)
Example #20
def find_bounding_shape(virtual_size, available_shape):
    """
    Finds a tuple of the same length as ``available_shape``, with every element
    not greater than the corresponding element of ``available_shape``,
    and product not lower than ``virtual_size``.
    """
    assert virtual_size <= product(available_shape)

    free_size = virtual_size
    free_dims = set(range(len(available_shape)))
    bounding_shape = [None] * len(available_shape)

    while len(free_dims) > 0:
        guess = ceiling_root(free_size, len(free_dims))
        for fdim in free_dims:
            bounding_shape[fdim] = guess

        for fdim in free_dims:
            if bounding_shape[fdim] > available_shape[fdim]:
                bounding_shape[fdim] = available_shape[fdim]
                free_dims.remove(fdim)
                free_size = min_blocks(free_size, bounding_shape[fdim])
                break
        else:
            return tuple(bounding_shape)

    return tuple(available_shape)
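
A usage sketch exercising the documented contract (compare the test for this function further down the page), assuming product as sketched earlier:

available_shape = (16, 16, 8)   # 2048 available threads
shape = find_bounding_shape(1000, available_shape)
assert product(shape) >= 1000
assert all(d <= ad for d, ad in zip(shape, available_shape))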
Example #21
    def _build_plan(self, plan_factory, device_params, lwe_a, lwe_b, accum_a, gsw, bara):

        params = self._params
        tlwe_params = params.tlwe_params
        decomp_length = params.decomp_length
        mask_size = tlwe_params.mask_size

        perf_params = self._perf_params
        transform_type = self._params.tlwe_params.transform_type
        transform = get_transform(transform_type)

        transform_module = transform.transform_module(perf_params, multi_iter=True)

        batch_shape = accum_a.shape[:-2]

        min_local_size = decomp_length * (mask_size + 1) * transform_module.threads_per_transform
        local_size = device_params.max_work_group_size
        while local_size >= min_local_size:

            plan = plan_factory()

            if transform_module.use_constant_memory:
                cdata_forward = plan.constant_array(transform_module.cdata_fw)
                cdata_inverse = plan.constant_array(transform_module.cdata_inv)
            else:
                cdata_forward = plan.persistent_array(transform_module.cdata_fw)
                cdata_inverse = plan.persistent_array(transform_module.cdata_inv)

            try:
                plan.kernel_call(
                    TEMPLATE.get_def("BlindRotate"),
                    [lwe_a, lwe_b, accum_a, gsw, bara, cdata_forward, cdata_inverse],
                    global_size=(
                        helpers.product(batch_shape),
                        local_size),
                    local_size=(1, local_size),
                    render_kwds=dict(
                        local_size=local_size,
                        slices=(len(batch_shape), 1, 1),
                        slices2=(len(batch_shape), 1),
                        slices3=(len(batch_shape),),
                        transform=transform_module,
                        mask_size=mask_size,
                        decomp_length=decomp_length,
                        output_size=self._in_out_params.size,
                        input_size=tlwe_params.extracted_lweparams.size,
                        bs_log2_base=self._params.bs_log2_base,
                        mul=transform.transformed_mul(perf_params),
                        add=transform.transformed_add(perf_params),
                        tr_ctype=transform.transformed_internal_ctype(),
                        min_blocks=helpers.min_blocks,
                        )
                    )
            except OutOfResourcesError:
                local_size -= transform_module.threads_per_transform
                continue

            return plan

        raise ValueError("Could not find suitable local size for the kernel")
Example #22
    def _build_plan(self, plan_factory, device_params, output, input_):

        plan = plan_factory()

        batch_size = helpers.product(output.shape[:-1])
        blocks_num = helpers.min_blocks(batch_size, self._transforms_per_block)

        cdata_arr = self._transform.cdata_inv if self._inverse else self._transform.cdata_fw
        if self._transform.use_constant_memory:
            cdata = plan.constant_array(cdata_arr)
        else:
            cdata = plan.persistent_array(cdata_arr)

        plan.kernel_call(
            TEMPLATE.get_def('standalone_transform'), [output, input_, cdata],
            global_size=(blocks_num, self._transform.threads_per_transform *
                         self._transforms_per_block),
            local_size=(1, self._transform.threads_per_transform *
                        self._transforms_per_block),
            render_kwds=dict(inverse=self._inverse,
                             i32_conversion=self._i32_conversion,
                             kernel_repetitions=self._kernel_repetitions,
                             transform=self._transform,
                             transforms_per_block=self._transforms_per_block,
                             batch_size=batch_size,
                             blocks_num=blocks_num,
                             slices=(len(output.shape) - 1, 1)))

        return plan
Example #23
File: fftshift.py Project: fjarri/reikna
    def _build_plan(self, plan_factory, device_params, output, input_, inverse):

        if helpers.product([input_.shape[i] for i in self._axes]) == 1:
            return self._build_trivial_plan(plan_factory, output, input_)

        plan = plan_factory()

        axes = tuple(sorted(self._axes))
        shape = list(input_.shape)

        if all(shape[axis] % 2 == 0 for axis in axes):
            # If all shift axes have even length, it is possible to perform the shift inplace
            # (by swapping pairs of elements).
            # Note that the inplace fftshift is its own inverse.
            shape[axes[0]] //= 2
            plan.kernel_call(
                TEMPLATE.get_def('fftshift_inplace'), [output, input_],
                kernel_name="kernel_fftshift_inplace",
                global_size=shape,
                render_kwds=dict(axes=axes))
        else:
            # Resort to an out-of-place shift to a temporary array and then copy.
            temp = plan.temp_array_like(output)
            plan.kernel_call(
                TEMPLATE.get_def('fftshift_outplace'), [temp, input_, inverse],
                kernel_name="kernel_fftshift_outplace",
                global_size=shape,
                render_kwds=dict(axes=axes))

            copy_trf = copy(input_, out_arr_t=output)
            copy_comp = PureParallel.from_trf(copy_trf, copy_trf.input)
            plan.computation_call(copy_comp, output, temp)

        return plan
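
For even-length axes fftshift amounts to swapping the two halves of each shifted axis, which is why the kernel above needs to cover only half of the index space (shape[axes[0]] //= 2) and why the in-place variant is its own inverse. A NumPy illustration for a single axis:

import numpy

x = numpy.arange(8)
left, right = x[:4].copy(), x[4:].copy()
x[:4], x[4:] = right, left   # swap the two halves in place
assert (x == numpy.fft.fftshift(numpy.arange(8))).all()
# Repeating the swap restores the original array: for even lengths,
# fftshift is its own inverse.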
Example #24
File: test_fft.py Project: ringw/reikna
def check_performance(thr_and_double, shape_and_axes, fast_math):
    thr, double = thr_and_double

    shape, axes = shape_and_axes
    dtype = numpy.complex128 if double else numpy.complex64

    data = get_test_array(shape, dtype)
    data_dev = thr.to_device(data)
    res_dev = thr.empty_like(data_dev)

    fft = FFT(data_dev, axes=axes)
    fftc = fft.compile(thr, fast_math=fast_math)

    attempts = 10
    t1 = time.time()
    for i in range(attempts):
        fftc(res_dev, data_dev)
    thr.synchronize()
    t2 = time.time()
    dev_time = (t2 - t1) / attempts

    fwd_ref = numpy.fft.fftn(data, axes=axes).astype(dtype)
    assert diff_is_negligible(res_dev.get(), fwd_ref)

    return dev_time, product(shape) * sum([numpy.log2(shape[a]) for a in axes]) * 5
Example #25
File: signature.py Project: ringw/reikna
    def compatible_with(self, other):
        if self.dtype != other.dtype:
            return False

        common_shape_len = min(len(self.shape), len(other.shape))
        if self.shape[-common_shape_len:] != other.shape[-common_shape_len:]:
            return False
        if self.strides[-common_shape_len:] != other.strides[
                -common_shape_len:]:
            return False
        if helpers.product(self.shape[:-common_shape_len]) != 1:
            return False
        if helpers.product(other.shape[:-common_shape_len]) != 1:
            return False

        return True
Example #26
File: fft.py Project: mgolub2/reikna
    def _build_plan(self, plan_factory, device_params, output, input_, inverse):

        if helpers.product([input_.shape[i] for i in self._axes]) == 1:
            return self._build_trivial_plan(plan_factory, output, input_)

        # While resource consumption of GlobalFFTKernel can be made lower by passing
        # lower value to prepare_for(), LocalFFTKernel may have to be split into several kernels.
        # Therefore, if GlobalFFTKernel.prepare_for() raises OutOfResourcesError,
        # we just call prepare_for() with lower limit, but if LocalFFTKernel.prepare_for()
        # does that, we have to recreate the whole chain.
        local_kernel_limit = device_params.max_work_group_size

        while local_kernel_limit >= 1:
            try:
                plan = self._build_limited_plan(
                    plan_factory, device_params, local_kernel_limit, output, input_, inverse)
            except LocalKernelFail:
                # One of LocalFFTKernels was out of resources.
                # Reduce the limit and try to create operations from scratch again.
                local_kernel_limit //= 2
                continue
            except GlobalKernelFail:
                raise ValueError(
                    "Could not find suitable call parameters for one of the global kernels")

            return plan

        raise ValueError("Could not find suitable call parameters for one of the local kernels")
Example #27
File: fft.py Project: ringw/reikna
    def __init__(self, dtype, device_params, outer_shape, fft_size,
                 fft_size_real, inner_shape, reverse_direction):

        self.name = "fft_local"
        self.inplace_possible = True
        self.output_shape = outer_shape + (fft_size_real if reverse_direction
                                           else fft_size, )
        if fft_size_real != fft_size and reverse_direction:
            self.kweights = get_kweights(fft_size_real, fft_size)
        else:
            self.kweights = None

        self._fft_size = fft_size
        self._fft_size_real = fft_size_real
        self._outer_batch = helpers.product(outer_shape)
        self._local_mem_size = device_params.local_mem_size
        self._itemsize = dtype.itemsize

        self._constant_kwds = get_common_kwds(dtype, device_params)
        self._constant_kwds.update(
            dict(takes_kweights=(self.kweights is not None),
                 input_slices=(len(outer_shape), 1, len(inner_shape)),
                 output_slices=(len(outer_shape), 1, len(inner_shape)),
                 pad_in=(fft_size != fft_size_real and not reverse_direction),
                 unpad_out=(fft_size != fft_size_real and reverse_direction),
                 reverse_direction=reverse_direction,
                 normalize=True))
Example #28
File: fft.py Project: ringw/reikna
def get_fft_1d_kernels(dtype,
                       device_params,
                       outer_shape,
                       fft_size,
                       inner_shape,
                       local_kernel_limit,
                       reverse_direction=False,
                       fft_size_real=None):
    """Create and compile kernels for one of the dimensions"""

    kernels = []

    if fft_size_real is None:
        fft_size_real = fft_size

    if (helpers.product(inner_shape) == 1
            and fft_size // MAX_RADIX <= local_kernel_limit):
        kernels.append(
            LocalFFTKernel(dtype, device_params, outer_shape, fft_size,
                           fft_size_real, inner_shape, reverse_direction))
    else:
        kernels.extend(
            GlobalFFTKernel.create_chain(dtype, device_params, outer_shape,
                                         fft_size, fft_size_real, inner_shape,
                                         reverse_direction))

    return kernels
Example #29
File: fft.py Project: ringw/reikna
    def _build_plan(self, plan_factory, device_params, output, input_,
                    inverse):

        if helpers.product([input_.shape[i] for i in self._axes]) == 1:
            return self._build_trivial_plan(plan_factory, output, input_)

        # While resource consumption of GlobalFFTKernel can be made lower by passing
        # lower value to prepare_for(), LocalFFTKernel may have to be split into several kernels.
        # Therefore, if GlobalFFTKernel.prepare_for() raises OutOfResourcesError,
        # we just call prepare_for() with lower limit, but if LocalFFTKernel.prepare_for()
        # does that, we have to recreate the whole chain.
        local_kernel_limit = device_params.max_work_group_size

        while local_kernel_limit >= 1:
            try:
                plan = self._build_limited_plan(plan_factory, device_params,
                                                local_kernel_limit, output,
                                                input_, inverse)
            except LocalKernelFail:
                # One of LocalFFTKernels was out of resources.
                # Reduce the limit and try to create operations from scratch again.
                local_kernel_limit //= 2
                continue
            except GlobalKernelFail:
                raise ValueError(
                    "Could not find suitable call parameters for one of the global kernels"
                )

            return plan

        raise ValueError(
            "Could not find suitable call parameters for one of the local kernels"
        )
Example #30
File: test_dht.py Project: ringw/reikna
    def __call__(self, *xs):
        """
        Evaluate function in coordinate space for given grid.
        """

        if len(xs) > 1:
            xxs = numpy.meshgrid(*xs, indexing="ij")
        else:
            xxs = xs

        res_shape = ((self.batch, )
                     if self.batch is not None else tuple()) + xxs[0].shape
        res = numpy.zeros(res_shape, self.dtype)

        for coeff, coord in self.modes:
            if self.batch is not None:
                b = coord[0]
                coord = coord[1:]
                target = res[b]
            else:
                target = res

            target += coeff * product(
                [self.harmonics[m](xx) for m, xx in zip(coord, xxs)])

        return res**self.order
Example #31
    def _build_plan(self, plan_factory, device_params, output, input_,
                    inverse):

        if helpers.product([input_.shape[i] for i in self._axes]) == 1:
            return self._build_trivial_plan(plan_factory, output, input_)

        plan = plan_factory()

        axes = tuple(sorted(self._axes))
        shape = list(input_.shape)

        if all(shape[axis] % 2 == 0 for axis in axes):
            # If all shift axes have even length, it is possible to perform the shift inplace
            # (by swapping pairs of elements).
            # Note that the inplace fftshift is its own inverse.
            shape[axes[0]] //= 2
            plan.kernel_call(TEMPLATE.get_def('fftshift_inplace'),
                             [output, input_],
                             kernel_name="kernel_fftshift_inplace",
                             global_size=shape,
                             render_kwds=dict(axes=axes))
        else:
            # Resort to an out-of-place shift to a temporary array and then copy.
            temp = plan.temp_array_like(output)
            plan.kernel_call(TEMPLATE.get_def('fftshift_outplace'),
                             [temp, input_, inverse],
                             kernel_name="kernel_fftshift_outplace",
                             global_size=shape,
                             render_kwds=dict(axes=axes))

            copy_trf = copy(input_, out_arr_t=output)
            copy_comp = PureParallel.from_trf(copy_trf, copy_trf.input)
            plan.computation_call(copy_comp, output, temp)

        return plan
Example #32
def pytest_generate_tests(metafunc):

    errors_shapes_and_axes = [
        ((10, ), (0, )),
        ((11, ), (0, )),
        ((9000, ), (0, )),
        ((9001, ), (0, )),
        ((128, 60), (0, 1)),
        ((127, 60), (0, 1)),
        ((127, 61), (0, 1)),
        ((100, 80, 60), (0, 1, 2)),
        ((101, 80, 61), (0, 1, 2)),
        ((101, 80, 61), (0, 2)),
        ((20, 31, 80, 61), (0, 2)),
    ]

    perf_shapes = [
        (2**4, ),  # 1D, small size
        (2**18, ),  # 1D, large size
        (2**4, 2**4),  # 2D, small size
        (2**9, 2**9),  # 2D, large size
    ]
    perf_even_shapes_and_axes = []
    perf_odd_shapes_and_axes = []

    mem_limit = 2**22

    for contiguous in (True, False):
        for shape in perf_shapes:
            batch = mem_limit // product(shape)
            if contiguous:
                full_shape = (batch, ) + shape
                axes = tuple(range(1, len(shape) + 1))
            else:
                full_shape = shape + (batch, )
                axes = tuple(range(0, len(shape)))

            perf_even_shapes_and_axes.append((full_shape, axes))

            full_shape = list(full_shape)
            for axis in axes:
                full_shape[axis] -= 1
            perf_odd_shapes_and_axes.append((tuple(full_shape), axes))

    idgen = lambda pair: str(pair[0]) + '_over_' + str(pair[1])

    if 'errors_shape_and_axes' in metafunc.funcargnames:
        metafunc.parametrize('errors_shape_and_axes',
                             errors_shapes_and_axes,
                             ids=list(map(idgen, errors_shapes_and_axes)))

    elif 'perf_even_shape_and_axes' in metafunc.funcargnames:
        metafunc.parametrize('perf_even_shape_and_axes',
                             perf_even_shapes_and_axes,
                             ids=list(map(idgen, perf_even_shapes_and_axes)))

    elif 'perf_odd_shape_and_axes' in metafunc.funcargnames:
        metafunc.parametrize('perf_odd_shape_and_axes',
                             perf_odd_shapes_and_axes,
                             ids=list(map(idgen, perf_odd_shapes_and_axes)))
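
For orientation, idgen (defined in the function above) simply stringifies the shape/axes pair; e.g. the first contiguous even-shape case produces the following id (values recomputed here, product as sketched earlier):

mem_limit = 2**22
shape = (2**4,)
batch = mem_limit // product(shape)      # 262144
pair = ((batch,) + shape, (1,))          # the first contiguous case
assert idgen(pair) == '(262144, 16)_over_(1,)'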
Example #33
File: fft.py Project: mgolub2/reikna
    def prepare_for(self, max_local_size):
        kwds = dict(self._constant_kwds)

        radix_arr, radix1_arr, radix2_arr = get_global_radix_info(self._fft_size)

        radix = radix_arr[self._pass_num]
        radix1 = radix1_arr[self._pass_num]
        radix2 = radix2_arr[self._pass_num]

        stride_out = self._inner_batch * helpers.product(radix_arr[:self._pass_num])
        stride = stride_out * radix
        stride_in = stride_out * helpers.product(radix_arr[self._pass_num+1:])

        threads_per_xform = radix2

        coalesce_width = kwds['min_mem_coalesce_width']
        local_batch = max_local_size if radix2 == 1 else coalesce_width
        local_batch = min(local_batch, stride_in)
        local_size = min(local_batch * threads_per_xform, max_local_size)
        local_batch = local_size // threads_per_xform

        workgroups_num = helpers.min_blocks(stride_in, local_batch) * self._outer_batch

        if radix2 == 1:
            lmem_size = 0
        else:
            if stride_out == 1:
                lmem_size = (radix + 1) * local_batch
            else:
                lmem_size = local_size * radix1

        if lmem_size * self._itemsize // 2 > self._local_mem_size:
            raise OutOfResourcesError

        kwds.update(self._constant_kwds)
        kwds.update(dict(
            fft_size=self._fft_size, curr_size=self._curr_size, fft_size_real=self._fft_size_real,
            pass_num=self._pass_num,
            lmem_size=lmem_size, local_batch=local_batch, local_size=local_size,
            inner_batch=self._inner_batch,
            radix_arr=radix_arr, radix1_arr=radix1_arr, radix2_arr=radix2_arr,
            radix1=radix1, radix2=radix2, radix=radix,
            stride_in=stride_in, stride_out=stride_out, stride=stride,
            last_pass=self._last_pass))

        return workgroups_num * local_size, local_size, kwds
Example #34
File: test_scan.py Project: fjarri/reikna
def test_large_scan_performance(thr, large_perf_shape, exclusive):
    """
    Large problem sizes.
    """
    dtype = dtypes.normalize_type(numpy.int64)
    min_time = check_scan(
        thr, large_perf_shape, dtype=dtype, axes=None, exclusive=exclusive, measure_time=True)
    return min_time, helpers.product(large_perf_shape) * dtype.itemsize
Example #35
def test_group_dimensions(virtual_shape, available_shape):
    """
    Tests that ``group_dimensions()`` obeys its contracts.
    """
    v_groups, a_groups = vsize.group_dimensions(virtual_shape, available_shape)
    v_dims = []
    a_dims = []
    for v_group, a_group in zip(v_groups, a_groups):
        v_shape = virtual_shape[v_group[0]:v_group[-1]+1]
        a_shape = available_shape[a_group[0]:a_group[-1]+1]
        assert product(v_shape) <= product(a_shape)

        v_dims += v_group
        a_dims += a_group

    assert v_dims == list(range(len(virtual_shape)))
    assert a_dims == list(range(len(available_shape[:len(a_dims)])))
Example #36
File: test_vsizes.py Project: ringw/reikna
def test_find_bounding_shape(virtual_size, available_shape):
    """
    Tests that ``find_bounding_shape()`` obeys its contracts.
    """
    shape = vsize.find_bounding_shape(virtual_size, available_shape)
    assert all(isinstance(d, int) for d in shape)
    assert product(shape) >= virtual_size
    assert all(d <= ad for d, ad in zip(shape, available_shape))
Example #37
File: test_vsizes.py Project: ringw/reikna
def test_group_dimensions(virtual_shape, available_shape):
    """
    Tests that ``group_dimensions()`` obeys its contracts.
    """
    v_groups, a_groups = vsize.group_dimensions(virtual_shape, available_shape)
    v_dims = []
    a_dims = []
    for v_group, a_group in zip(v_groups, a_groups):
        v_shape = virtual_shape[v_group[0]:v_group[-1] + 1]
        a_shape = available_shape[a_group[0]:a_group[-1] + 1]
        assert product(v_shape) <= product(a_shape)

        v_dims += v_group
        a_dims += a_group

    assert v_dims == list(range(len(virtual_shape)))
    assert a_dims == list(range(len(available_shape[:len(a_dims)])))
Example #38
    def _build_plan(self, plan_factory, device_params, output, input_):

        plan = plan_factory()

        N = input_.shape[-1] * 2
        batch_shape = input_.shape[:-1]
        batch_size = helpers.product(batch_shape)

        coeffs1 = 4 * numpy.sin(2 * numpy.pi * numpy.arange(N // 2) / N)
        coeffs2 = 2 * numpy.cos(2 * numpy.pi * numpy.arange(N // 2) / N)

        c1_arr = plan.persistent_array(coeffs1)
        c2_arr = plan.persistent_array(coeffs2)

        multiply = get_multiply(input_)

        # re_X_1 = sum(x * coeffs2)

        t = plan.temp_array_like(input_)
        rd = Reduce(t,
                    predicate_sum(input_.dtype),
                    axes=(len(input_.shape) - 1, ))

        rd.parameter.input.connect(multiply,
                                   multiply.output,
                                   x=multiply.a,
                                   c2=multiply.b)

        re_X_0 = plan.temp_array_like(rd.parameter.output)
        plan.computation_call(rd, re_X_0, input_, c2_arr)

        # Y = numpy.fft.rfft(x * coeffs1)

        rfft = RFFT(input_, dont_store_last=True)
        rfft.parameter.input.connect(multiply,
                                     multiply.output,
                                     x=multiply.a,
                                     c1=multiply.b)

        Y = plan.temp_array_like(rfft.parameter.output)
        plan.computation_call(rfft, Y, input_, c1_arr)

        # Y *= -1j
        # Y[0] /= 2
        # Y[0] += re_X_1
        # res = numpy.cumsum(Y[:-1])

        prepare_prfft_scan = get_prepare_prfft_scan(Y)

        sc = Scan(Y, predicate_sum(Y.dtype), axes=(-1, ), exclusive=False)
        sc.parameter.input.connect(prepare_prfft_scan,
                                   prepare_prfft_scan.output,
                                   Y=prepare_prfft_scan.Y,
                                   re_X_0=prepare_prfft_scan.re_X_0)

        plan.computation_call(sc, output, Y, re_X_0)

        return plan
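
Collecting the commented steps gives a rough host-side NumPy transcription of the recipe (for orientation only, not a drop-in equivalent of the plan's output layout; note that the plan stores the first reduction in re_X_0 even though the comment calls it re_X_1, and that dont_store_last=True plays the role of the Y[:-1] in the comment):

import numpy

def reference_steps(x):
    # x: real 1D array of the same length as input_.shape[-1], so N = 2 * len(x).
    N = 2 * len(x)
    k = numpy.arange(N // 2)
    coeffs1 = 4 * numpy.sin(2 * numpy.pi * k / N)
    coeffs2 = 2 * numpy.cos(2 * numpy.pi * k / N)

    re_X_0 = (x * coeffs2).sum()           # the Reduce computation
    Y = numpy.fft.rfft(x * coeffs1)[:-1]   # RFFT with dont_store_last=True
    Y = Y * -1j
    Y[0] /= 2
    Y[0] += re_X_0
    return numpy.cumsum(Y)                 # the inclusive Scan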
Example #39
File: tlwe_gpu.py Project: yzhaiustc/nufhe
    def _build_plan(
            self, plan_factory, device_params,
            result_a, result_cv, key, noises1, noises2):

        plan = plan_factory()

        polynomial_degree = self._polynomial_degree
        batch_shape = result_a.shape[:-2]
        batch_len = helpers.product(batch_shape)

        perf_params = self._perf_params

        transform = get_transform(self._transform_type)

        ft_key = transform.ForwardTransform(key.shape[:-1], polynomial_degree, perf_params)
        key_tr = plan.temp_array_like(ft_key.parameter.output)

        ft_noises = transform.ForwardTransform(noises1.shape[:-1], polynomial_degree, perf_params)
        noises1_tr = plan.temp_array_like(ft_noises.parameter.output)

        ift = transform.InverseTransform(noises1.shape[:-1], polynomial_degree, perf_params)
        ift_res = plan.temp_array_like(ift.parameter.output)

        mul_tr = Transformation(
            [
                Parameter('output', Annotation(ift.parameter.input, 'o')),
                Parameter('key', Annotation(key_tr, 'i')),
                Parameter('noises1', Annotation(noises1_tr, 'i'))
            ],
            """
            ${output.store_same}(${tr_ctype}unpack(${mul}(
                ${tr_ctype}pack(${key.load_idx}(${idxs[-2]}, ${idxs[-1]})),
                ${tr_ctype}pack(${noises1.load_same})
                )));
            """,
            connectors=['output', 'noises1'],
            render_kwds=dict(
                mul=transform.transformed_mul(perf_params),
                tr_ctype=transform.transformed_internal_ctype()))

        ift.parameter.input.connect(mul_tr, mul_tr.output, key=mul_tr.key, noises1=mul_tr.noises1)

        plan.computation_call(ft_key, key_tr, key)
        plan.computation_call(ft_noises, noises1_tr, noises1)
        plan.computation_call(ift, ift_res, key_tr, noises1_tr)
        plan.kernel_call(
            TEMPLATE.get_def("tlwe_encrypt_zero_fill_result"),
            [result_a, result_cv, noises1, noises2, ift_res],
            kernel_name="tlwe_encrypt_zero_fill_result",
            global_size=(batch_len, self._mask_size + 1, polynomial_degree),
            render_kwds=dict(
                noise=self._noise, mask_size=self._mask_size,
                noises1_slices=(len(batch_shape), 1, 1),
                noises2_slices=(len(batch_shape), 1),
                cv_slices=(len(batch_shape),)
                ))

        return plan
Example #40
File: cbrng.py Project: SyamGadde/reikna
    def _build_plan(self, plan_factory, _device_params, counters, randoms):

        plan = plan_factory()

        plan.kernel_call(
            TEMPLATE.get_def('cbrng'),
            [counters, randoms],
            global_size=helpers.product(counters.shape),
            render_kwds=dict(
                sampler=self._sampler,
                keygen=self._keygen,
                batch=helpers.product(randoms.shape[:-self._generators_dim]),
                counters_slices=[self._generators_dim],
                randoms_slices=[
                    len(randoms.shape) - self._generators_dim,
                    self._generators_dim]))

        return plan
Example #41
File: api.py Project: fjarri/reikna
    def __init__(self, thr, template_src, name, global_size, local_size=None,
            render_args=None, render_kwds=None, fast_math=False, compiler_options=None,
            constant_arrays=None, keep=False):
        """__init__()""" # hide the signature from Sphinx

        self._thr = thr

        if render_args is None:
            render_args = []
        if render_kwds is None:
            render_kwds = {}

        main_src = render_template_source(
            template_src, render_args=render_args, render_kwds=render_kwds)

        # Since the virtual size functions require some registers, they affect the maximum local size.
        # Start from the device's max work group size as the first approximation
        # and recompile kernels with smaller local sizes until convergence.
        max_local_size = thr.device_params.max_work_group_size

        while True:

            # Try to find kernel launch parameters for the requested local size.
            # May raise OutOfResourcesError if it's not possible,
            # just let it pass to the caller.
            vs = VirtualSizes(
                thr.device_params, global_size,
                virtual_local_size=local_size,
                max_local_size=max_local_size)

            # Try to compile the kernel with the corresponding virtual size functions
            program = Program(
                self._thr, vs.vsize_functions + main_src,
                static=True, fast_math=fast_math, compiler_options=compiler_options,
                constant_arrays=constant_arrays, keep=keep)
            kernel = getattr(program, name)

            if kernel.max_work_group_size >= product(vs.real_local_size):
                # Kernel will execute with this local size, use it
                break

            # By the contract of VirtualSizes,
            # product(vs.real_local_size) <= max_local_size
            # Also, since we're still in this loop,
            # kernel.max_work_group_size < product(vs.real_local_size).
            # Therefore the new max_local_size value is guaranteed
            # to be smaller than the previous one.
            max_local_size = kernel.max_work_group_size

        self._program = program
        self._kernel = kernel
        self.virtual_local_size = vs.virtual_local_size
        self.virtual_global_size = vs.virtual_global_size
        self.local_size = vs.real_local_size
        self.global_size = vs.real_global_size

        self._kernel.prepare(self.global_size, local_size=self.local_size)
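
The comment block in the middle of this constructor carries the termination argument for the recompilation loop. A stripped-down model of that loop (hypothetical names, no real compilation) makes the invariant easier to see:

def find_runnable_local_size(device_max_wg_size, compile_and_query):
    """
    A sketch of the loop above. ``compile_and_query(bound)`` is assumed
    to return (real_local_size_product, kernel_max_wg_size) for a kernel
    compiled with virtual sizes limited by ``bound``.
    """
    max_local_size = device_max_wg_size
    while True:
        real_ls, kernel_max = compile_and_query(max_local_size)
        if kernel_max >= real_ls:
            # The kernel can actually execute with this local size.
            return real_ls
        # By contract, real_ls <= max_local_size, and we only get here
        # when kernel_max < real_ls, so the new bound is strictly smaller:
        # the loop makes progress and must terminate.
        max_local_size = kernel_max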
Example #47
File: cbrng.py Project: ringw/reikna
    def _build_plan(self, plan_factory, _device_params, counters, randoms):

        plan = plan_factory()

        plan.kernel_call(TEMPLATE.get_def('cbrng'), [counters, randoms],
                         global_size=helpers.product(counters.shape),
                         render_kwds=dict(
                             sampler=self._sampler,
                             keygen=self._keygen,
                             batch=helpers.product(
                                 randoms.shape[:-self._generators_dim]),
                             counters_slices=[self._generators_dim],
                             randoms_slices=[
                                 len(randoms.shape) - self._generators_dim,
                                 self._generators_dim
                             ]))

        return plan
Example #48
    def generate_modes(mshape, dtype, batch=None, random=True):
        """
        Generates a list of sparse modes for a problem of the given shape.
        """

        max_modes_per_batch = 20

        modelist = []
        if product(mshape) <= max_modes_per_batch:
            # If there are not many modes, fill all of them
            # Materialize the iterator so it can be traversed once per batch below
            modenums = list(itertools.product(*[range(modes) for modes in mshape]))
            if batch is not None:
                for b in range(batch):
                    modelist += [((b,) + modenum) for modenum in modenums]
            else:
                modelist += list(modenums)
        else:
            # If there are many modes, fill some random ones
            rand_coord = lambda: tuple(
                numpy.random.randint(0, mshape[i]) for i in range(len(mshape)))

            if batch is not None:
                for b in range(batch):
                    for i in range(max_modes_per_batch):
                        modelist.append((b,) + rand_coord())
            else:
                for i in range(max_modes_per_batch):
                    modelist.append(rand_coord())

        # Add corner modes to make sure extreme cases are still processed correctly
        corner_modes = itertools.product(*[(0, mshape[i]-1) for i in range(len(mshape))])
        for modenum in corner_modes:
            if batch is not None:
                for b in range(batch):
                    modelist.append((b,) + modenum)
            else:
                modelist.append(modenum)

        modelist = set(modelist) # remove duplicates

        # Assign coefficients
        modes = []
        for coord in modelist:
            get_coeff = lambda: numpy.random.normal() if random else 1
            if dtypes.is_complex(dtype):
                coeff = get_coeff() + 1j * get_coeff()
            else:
                coeff = get_coeff()
            coeff = dtype(coeff)

            # Scale down the coefficients of higher modes to compensate for their lower precision
            modenums = coord if batch is None else coord[1:]
            coeff /= sum(modenums) + 1
            modes.append((coeff, coord))

        return modes
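
A hypothetical invocation (assuming the function, ``numpy``, and reikna's ``dtypes`` module are in scope):

modes = generate_modes((4, 4), numpy.complex128, batch=2, random=False)
# Every entry is a (coefficient, coordinate) pair; with ``batch`` set,
# each coordinate is prefixed with its batch index.
coeff, coord = modes[0]
assert len(coord) == 3  # (batch, mode_x, mode_y)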
Example #49
    def __init__(self, shape, box, drift, trajectories=1, diffusion=None):

        if diffusion is not None:
            assert diffusion.dtype == drift.dtype
            assert diffusion.components == drift.components

            if not diffusion.real_noise or dtypes.is_real(drift.dtype):
                noise_dtype = drift.dtype
            else:
                noise_dtype = dtypes.real_for(drift.dtype)

            self.noise_type = Type(noise_dtype, (trajectories, diffusion.noise_sources) + shape)
            self.noise = True

            cell_volume = product(box) / product(shape)
            self._noise_normalization = 1. / cell_volume
        else:
            self.noise_type = None
            self.noise = False
Example #50
File: test_scan.py Project: fjarri/reikna
def test_small_scan_performance(thr, exclusive, seq_size):
    """
    Small problem sizes, big batches.
    """
    dtype = dtypes.normalize_type(numpy.complex128)
    shape = (500, 2, 2, 512)
    min_time = check_scan(
        thr, shape, dtype=dtype, axes=(-1,), exclusive=exclusive,
        measure_time=True, seq_size=seq_size)
    return min_time, helpers.product(shape) * dtype.itemsize
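
The returned pair is presumably turned into a bandwidth figure by the caller; a sketch of that arithmetic with a made-up timing:

min_time = 0.0021                   # hypothetical measured seconds
byte_size = 500 * 2 * 2 * 512 * 16  # product(shape) * complex128 itemsize
print("effective throughput: {:.2f} GB/s".format(byte_size / min_time / 1e9))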
Example #51
File: vsize.py Project: mgolub2/reikna
def _group_dimensions(vdim, virtual_shape, adim, available_shape):
    """
    ``vdim`` and ``adim`` are used for the absolute addressing of dimensions during recursive calls.
    """
    if len(virtual_shape) == 1 and virtual_shape[0] == 1:
        return [(vdim,)], [(adim,)]

    if len(virtual_shape) == 0:
        return [], []

    if virtual_shape[0] == 1:
        v_remainder, a_remainder = _group_dimensions(
            vdim + 1, virtual_shape[1:], adim, available_shape)
        return [(vdim,) + v_remainder[0]] + v_remainder[1:], a_remainder

    vdim_group = 1 # number of currently grouped virtual dimensions
    adim_group = 1 # number of currently grouped available dimensions

    while True:
        # If there are more elements in the virtual group than in the available group,
        # extend the available group by one dimension.
        if product(virtual_shape[:vdim_group]) > product(available_shape[:adim_group]):
            adim_group += 1
            continue

        # If the remaining available dimensions cannot accommodate the remaining virtual dimensions,
        # we try to fit one more virtual dimension in the virtual group.
        if product(virtual_shape[vdim_group:]) > product(available_shape[adim_group:]):
            vdim_group += 1
            continue

        # If we are here, it means that:
        # 1) the current available group can accommodate the current virtual group;
        # 2) the remaining available dimensions can accommodate the remaining virtual dimensions.
        # This means we can make a recursive call now.
        v_res = tuple(range(vdim, vdim + vdim_group))
        a_res = tuple(range(adim, adim + adim_group))
        v_remainder, a_remainder = _group_dimensions(
            vdim + vdim_group, virtual_shape[vdim_group:],
            adim + adim_group, available_shape[adim_group:])
        return [v_res] + v_remainder, [a_res] + a_remainder
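
The contracts can be checked by calling the function directly. A minimal trace, assuming ``_group_dimensions`` above and a ``product`` helper like reikna's ``helpers.product`` are in scope:

v_groups, a_groups = _group_dimensions(0, (6, 4), 0, (8, 4))
assert v_groups == [(0,), (1,)]
assert a_groups == [(0,), (1,)]
# The 6 virtual elements fit into the 8 available ones in dimension 0,
# and the remaining 4 fit into the remaining 4, so each dimension
# forms its own group.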
Example #52
File: vsize.py Project: fjarri/reikna
    def __init__(self, virtual_shape, available_shape):
        self.real_dims = {}
        self.real_strides = {}
        self.virtual_strides = {}
        self.major_vdims = {}
        self.bounding_shape = tuple()
        self.skip_thresholds = []

        v_groups, a_groups = group_dimensions(virtual_shape, available_shape)

        for v_group, a_group in zip(v_groups, a_groups):
            virtual_subshape = virtual_shape[v_group[0]:v_group[-1]+1]
            virtual_subsize = product(virtual_subshape)

            bounding_subshape = find_bounding_shape(
                virtual_subsize,
                available_shape[a_group[0]:a_group[-1]+1])

            self.bounding_shape += bounding_subshape

            if virtual_subsize < product(bounding_subshape):
                strides = [(adim, product(bounding_subshape[:i])) for i, adim in enumerate(a_group)]
                self.skip_thresholds.append((virtual_subsize, strides))

            for vdim in v_group:
                self.real_dims[vdim] = a_group
                self.real_strides[vdim] = tuple(
                    product(self.bounding_shape[a_group[0]:adim]) for adim in a_group)
                self.virtual_strides[vdim] = product(virtual_shape[v_group[0]:vdim])

                # The major virtual dimension (the one that does not require
                # modulus operation when extracting its index from the flat index)
                # is the last non-trivial one (not of size 1).
                # Modulus will not be optimized away by the compiler,
                # but we know that all threads outside of the virtual group will be
                # filtered out by VIRTUAL_SKIP_THREADS.
                for major_vdim in range(len(v_group) - 1, -1, -1):
                    if virtual_shape[v_group[major_vdim]] > 1:
                        break

                self.major_vdims[vdim] = v_group[major_vdim]
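
Given these strides, recovering a per-dimension index from a flat in-group index is one division and one modulus. A plain-Python sketch (made-up shape) of the addressing scheme the comments above describe:

def vdim_index(flat, stride, dim_size):
    # Index along one virtual dimension; the modulus is redundant for the
    # major (last non-trivial) dimension, as noted above.
    return (flat // stride) % dim_size

shape = (3, 4)    # one virtual group; the first dimension varies fastest
strides = (1, 3)  # product of the preceding in-group dimensions
flat = 7          # 7 == 1 + 2 * 3, i.e. indices (1, 2)
assert vdim_index(flat, strides[0], shape[0]) == 1
assert vdim_index(flat, strides[1], shape[1]) == 2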
Example #53
File: vsize.py Project: fjarri/reikna
def _group_dimensions(vdim, virtual_shape, adim, available_shape):
    """
    ``vdim`` and ``adim`` are used for the absolute addressing of dimensions during recursive calls.
    """
    if len(virtual_shape) == 0:
        return [], []

    vdim_group = 1 # number of currently grouped virtual dimensions
    adim_group = 1 # number of currently grouped available dimensions

    while True:
        # If there are more elements in the virtual group than in the available group,
        # extend the available group by one dimension.
        if product(virtual_shape[:vdim_group]) > product(available_shape[:adim_group]):
            adim_group += 1
            continue

        # If the remaining available dimensions cannot accommodate the remaining virtual dimensions,
        # we try to fit one more virtual dimension in the virtual group.
        if product(virtual_shape[vdim_group:]) > product(available_shape[adim_group:]):
            vdim_group += 1
            continue

        # If we are here, it means that:
        # 1) the current available group can accommodate the current virtual group;
        # 2) the remaining available dimensions can accommodate the remaining virtual dimensions.
        # This means we can make a recursive call now.

        # Attach any following trivial virtual dimensions (of size 1) to this group
        # This will help to avoid unassigned trivial dimensions with no real dimensions left.
        while vdim_group < len(virtual_shape) and virtual_shape[vdim_group] == 1:
            vdim_group += 1

        v_res = tuple(range(vdim, vdim + vdim_group))
        a_res = tuple(range(adim, adim + adim_group))

        v_remainder, a_remainder = _group_dimensions(
            vdim + vdim_group, virtual_shape[vdim_group:],
            adim + adim_group, available_shape[adim_group:])
        return [v_res] + v_remainder, [a_res] + a_remainder
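
The difference from the previous version is the trivial-dimension attachment. A small check (assuming this ``_group_dimensions`` and a ``product`` helper are in scope) showing trailing size-1 virtual dimensions being folded into the preceding group:

v_groups, a_groups = _group_dimensions(0, (4, 1, 1), 0, (8,))
assert v_groups == [(0, 1, 2)]  # both trivial dimensions joined the group
assert a_groups == [(0,)]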
Example #54
    def try_create(cls, global_size, local_size, max_num_groups, max_work_item_sizes):
        """
        This method is used to filter working combinations of parameters
        from the Cartesian product of all possible ones.
        Returns ``None`` if the parameters are not compatible.
        """
        if len(max_num_groups) != len(max_work_item_sizes):
            return None

        if local_size is not None:
            if len(local_size) > len(global_size):
                return None
            else:
                # We need the local size and the global size to have the same length
                local_size = local_size + (1,) * (len(global_size) - len(local_size))

            if product(local_size) > product(max_work_item_sizes):
                return None

            bounding_global_size = [
                ls * min_blocks(gs, ls) for gs, ls
                in zip(global_size, local_size)]

            if product(bounding_global_size) > product(max_num_groups):
                return None

        else:
            if product(global_size) > product(max_num_groups):
                return None

        return cls(global_size, local_size, max_num_groups, max_work_item_sizes)
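
``min_blocks`` above is assumed to be ceiling division, as in reikna's ``helpers.min_blocks``; a worked example of the bounding-global-size arithmetic in the middle branch:

def min_blocks(length, block):
    # Ceiling division: the smallest number of blocks covering ``length``.
    return (length + block - 1) // block

# For global_size=(100,) and local_size=(16,), the bounding global size is
# 16 * min_blocks(100, 16) == 16 * 7 == 112; try_create() returns None
# if its product exceeds product(max_num_groups).
assert min_blocks(100, 16) == 7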