예제 #1
0
    def __init__(self, arr_t, order=2, axes=None):
        tr_elems = norm_const(arr_t, order)
        out_dtype = tr_elems.output.dtype

        rd = Reduce(Type(out_dtype, arr_t.shape), predicate_sum(out_dtype), axes=axes)

        res_t = rd.parameter.output
        tr_sum = norm_const(res_t, 1. / order)

        rd.parameter.input.connect(tr_elems, tr_elems.output, input_prime=tr_elems.input)
        rd.parameter.output.connect(tr_sum, tr_sum.input, output_prime=tr_sum.output)

        self._rd = rd

        Computation.__init__(self, [
            Parameter('output', Annotation(res_t, 'o')),
            Parameter('input', Annotation(arr_t, 'i'))])
예제 #2
0
    def _build_plan(self, plan_factory, device_params, output, alpha, beta):

        plan = plan_factory()

        for_reduction = Type(numpy.float64, (alpha.shape[0], self._max_click_order))

        meter_trf = Transformation([
            Parameter('output', Annotation(for_reduction, 'o')),
            Parameter('alpha', Annotation(alpha, 'i')),
            Parameter('beta', Annotation(beta, 'i')),
            ],
            """
                VSIZE_T sample_idx = ${idxs[0]};
                VSIZE_T order = ${idxs[1]} + 1;

                ${alpha.ctype} result = COMPLEX_CTR(${alpha.ctype})(1, 0);
                for (VSIZE_T i = 0; i < ${modes}; i++) {
                    ${alpha.ctype} alpha = ${alpha.load_idx}(sample_idx, i);
                    ${beta.ctype} beta = ${beta.load_idx}(sample_idx, i);
                    ${alpha.ctype} t = ${mul_cc}(alpha, beta);
                    ${alpha.ctype} np = ${exp_c}(COMPLEX_CTR(${alpha.ctype})(-t.x, -t.y));

                    if (i >= order) {
                        result = ${mul_cc}(result, np);
                    }
                    else {
                        ${alpha.ctype} cp = COMPLEX_CTR(${alpha.ctype})(1 - np.x, -np.y);
                        result = ${mul_cc}(result, cp);
                    }
                }

                ${output.store_same}(result.x);
                """,
            render_kwds=dict(
                mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                exp_c=functions.exp(alpha.dtype),
                modes=self._system.modes,
                ))

        reduction = Reduce(for_reduction, predicate_sum(output.dtype), axes=(0,))
        reduction.parameter.input.connect(
            meter_trf, meter_trf.output, alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

        plan.computation_call(reduction, output, alpha, beta)

        return plan
예제 #3
0
def test_nondefault_function(thr):

    shape = (100, 100)
    a = get_test_array(shape, numpy.int64)
    a_dev = thr.to_device(a)
    b_ref = a.sum(0)

    predicate = Predicate(
        Snippet.create(lambda v1, v2: "return ${v1} + ${v2};"), 0)

    rd = Reduce(a_dev, predicate, axes=(0, ))

    b_dev = thr.empty_like(rd.parameter.output)

    rdc = rd.compile(thr)
    rdc(b_dev, a_dev)

    assert diff_is_negligible(b_dev.get(), b_ref)
예제 #4
0
파일: lwe_gpu.py 프로젝트: stjordanis/nufhe
    def _build_plan(self, plan_factory, device_params, output, matrix, vector):
        plan = plan_factory()

        summation = Reduce(matrix, predicate_sum(matrix.dtype), axes=(len(matrix.shape)-1,))

        mul_vec = Transformation([
            Parameter('output', Annotation(matrix, 'o')),
            Parameter('matrix', Annotation(matrix, 'i')),
            Parameter('vector', Annotation(vector, 'i'))],
            """
            ${output.store_same}(${mul}(${matrix.load_same}, ${vector.load_idx}(${idxs[-1]})));
            """,
            render_kwds=dict(mul=functions.mul(matrix.dtype, vector.dtype)),
            connectors=['output', 'matrix'])

        summation.parameter.input.connect(
            mul_vec, mul_vec.output, matrix=mul_vec.matrix, vector=mul_vec.vector)

        plan.computation_call(summation, output, matrix, vector)

        return plan
예제 #5
0
파일: lwe_gpu.py 프로젝트: stjordanis/nufhe
    def _build_plan(
            self, plan_factory, device_params,
            ks_a, ks_b, ks_cv, in_key, out_key, noises_a, noises_b):

        plan = plan_factory()

        extracted_n, t, base, inner_n = ks_a.shape

        mean = Reduce(noises_b, predicate_sum(noises_b.dtype))
        norm = transformations.div_const(mean.parameter.output, numpy.prod(noises_b.shape))
        mean.parameter.output.connect(norm, norm.input, mean=norm.output)

        noises_b_mean = plan.temp_array_like(mean.parameter.mean)

        mul_key = MatrixMulVector(noises_a)
        b_term = plan.temp_array_like(mul_key.parameter.output)

        build_keyswitch = PureParallel([
            Parameter('ks_a', Annotation(ks_a, 'o')),
            Parameter('ks_b', Annotation(ks_b, 'o')),
            Parameter('ks_cv', Annotation(ks_cv, 'o')),
            Parameter('in_key', Annotation(in_key, 'i')),
            Parameter('b_term', Annotation(b_term, 'i')),
            Parameter('noises_a', Annotation(noises_a, 'i')),
            Parameter('noises_b', Annotation(noises_b, 'i')),
            Parameter('noises_b_mean', Annotation(noises_b_mean, 'i'))],
            Snippet(
                TEMPLATE.get_def("make_lwe_keyswitch_key"),
                render_kwds=dict(
                    log2_base=self._log2_base, output_size=self._output_size,
                    double_to_t32=double_to_t32_module, noise=self._noise)),
            guiding_array="ks_b")

        plan.computation_call(mean, noises_b_mean, noises_b)
        plan.computation_call(mul_key, b_term, noises_a, out_key)
        plan.computation_call(
            build_keyswitch,
            ks_a, ks_b, ks_cv, in_key, b_term, noises_a, noises_b, noises_b_mean)

        return plan
예제 #6
0
def test_structure_type(thr):

    shape = (100, 100)
    dtype = dtypes.align(
        numpy.dtype([('i1', numpy.uint32),
                     ('nested', numpy.dtype([
                         ('v', numpy.uint64),
                     ])), ('i2', numpy.uint32)]))

    a = get_test_array(shape, dtype)
    a_dev = thr.to_device(a)

    # Have to construct the resulting array manually,
    # since numpy cannot reduce arrays with struct dtypes.
    b_ref = numpy.empty(100, dtype)
    b_ref['i1'] = a['i1'].sum(0)
    b_ref['nested']['v'] = a['nested']['v'].sum(0)
    b_ref['i2'] = a['i2'].sum(0)

    predicate = Predicate(
        Snippet.create(lambda v1, v2: """
            ${ctype} result = ${v1};
            result.i1 += ${v2}.i1;
            result.nested.v += ${v2}.nested.v;
            result.i2 += ${v2}.i2;
            return result;
            """,
                       render_kwds=dict(ctype=dtypes.ctype_module(dtype))),
        numpy.zeros(1, dtype)[0])

    rd = Reduce(a_dev, predicate, axes=(0, ))

    b_dev = thr.empty_like(rd.parameter.output)

    rdc = rd.compile(thr)
    rdc(b_dev, a_dev)
    b_res = b_dev.get()

    assert diff_is_negligible(b_res, b_ref)
예제 #7
0
    def _build_plan(self, plan_factory, device_params, output, alpha, beta):

        plan = plan_factory()

        for_reduction = Type(numpy.float64, (alpha.shape[0], self._max_moment))

        meter_trf = Transformation([
            Parameter('output', Annotation(for_reduction, 'o')),
            Parameter('alpha', Annotation(alpha, 'i')),
            Parameter('beta', Annotation(beta, 'i')),
            ],
            """
                VSIZE_T sample_idx = ${idxs[0]};
                VSIZE_T order = ${idxs[1]};

                ${alpha.ctype} result = COMPLEX_CTR(${alpha.ctype})(1, 0);
                for (VSIZE_T i = 0; i <= order; i++) {
                    ${alpha.ctype} alpha = ${alpha.load_idx}(sample_idx, i);
                    ${beta.ctype} beta = ${beta.load_idx}(sample_idx, i);
                    ${alpha.ctype} t = ${mul_cc}(alpha, beta);
                    t.x -= ${ordering};
                    result = ${mul_cc}(result, t);
                }
                ${output.store_same}(result.x);
                """,
            render_kwds=dict(
                mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                ordering=ordering(self._representation),
                ))

        reduction = Reduce(for_reduction, predicate_sum(output.dtype), axes=(0,))
        reduction.parameter.input.connect(
            meter_trf, meter_trf.output, alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

        plan.computation_call(reduction, output, alpha, beta)

        return plan
예제 #8
0
# Create LUT and stringify into preamble of map kernel
LUT = np.zeros(256, np.int32)
for b in xrange(8):
    LUT[(np.arange(256) & (1 << b)) != 0] += 1
strLUT = "constant int LUT[256] = {" + ",".join(map(str, LUT)) + "};\n"
byte_to_count = Transformation([
    Parameter('output', Annotation(Type(np.int32, (1, )), 'o')),
    Parameter('input', Annotation(Type(np.uint8, (1, )), 'i'))
], strLUT + """
        ${output.store_same}(LUT[${input.load_same}]);
    """)

predicate = Predicate(
    Snippet.create(lambda v1, v2: """return ${v1} + ${v2}"""), np.int32(0))

sum_bits_reduction = Reduce(byte_to_count.output, predicate)
sum_bits_reduction.parameter.input.connect(byte_to_count,
                                           byte_to_count.output,
                                           new_input=byte_to_count.input)
sum_bits = sum_bits_reduction.compile(thr)
#sum_byte_count = ReductionKernel(cx, np.int32, neutral="0",
#                    reduce_expr="a+b", map_expr="LUT[bytes[i]]",
#                    arguments="__global unsigned char *bytes",
#                    preamble=strLUT)
#def count_bits(img):
#    return sum_byte_count(img).get().item()
#
#pixel_inds = GenericScanKernel(cx, np.int32,
#                    arguments="__global unsigned char *bytes, "
#                              "int image_w, "
#                              "__global int *pixels",
예제 #9
0
# Test array
arr = numpy.random.randint(0, 10**6, 20000)

# A transformation that creates initial minmax structures for the given array of integers
to_mmc = Transformation([
    Parameter('output', Annotation(Type(mmc_dtype, arr.shape), 'o')),
    Parameter('input', Annotation(arr, 'i'))
], """
    ${output.ctype} res;
    res.cur_min = ${input.load_same};
    res.cur_max = ${input.load_same};
    ${output.store_same}(res);
    """)

# Create the reduction computation and attach the transformation above to its input.
reduction = Reduce(to_mmc.output, predicate)
reduction.parameter.input.connect(to_mmc,
                                  to_mmc.output,
                                  new_input=to_mmc.input)
creduction = reduction.compile(thr)

# Run the computation
arr_dev = thr.to_device(arr)
res_dev = thr.empty_like(reduction.parameter.output)
creduction(res_dev, arr_dev)
minmax = res_dev.get()

assert minmax["cur_min"] == arr.min()
assert minmax["cur_max"] == arr.max()
예제 #10
0
def solve(nufft,gy, solver=None,  maxiter=30, *args, **kwargs):
    """
    The solve function of NUFFT_hsa.
    The current version supports solvers = 'cg' or 'L1TVOLS'. 
    
    :param nufft: NUFFT_hsa object
    :param y: (M,) or (M, batch) array, non-uniform data. If batch is provided, 'cg' and 'L1TVOLS' returns different image shape.
    :type y: numpy.complex64 reikna array
    :return: x: Nd or Nd + (batch, ) image. L1TVOLS always returns Nd. 'cg' returns Nd + (batch, ) in batch mode. 
    :rtype: x: reikna array, complex64. 
    """
    # define the reduction kernel on the device
#     if None ==  solver:
#         solver  =   'cg'
    if 'L1TVLAD' == solver:
        x2=L1TVLAD(nufft, gy, maxiter=maxiter, *args, **kwargs  )
#         x2 = nufft.thr.copy_array(nufft.x_Nd)
        return x2
    elif 'L1TVOLS' == solver:
        x2=L1TVOLS(nufft, gy, maxiter=maxiter, *args, **kwargs  )
#         x2 = nufft.thr.copy_array(nufft.x_Nd)
        return x2
    elif 'dc'   ==  solver:
         """
         Density compensation method
         nufft.st['W'] will be computed if doesn't exist
         If nufft.st['W'] exist then x2 = nufft.adjoint(nufft.st['W']*y)
         input: 
             y: (M,) array
         output:
             x2: Nd array
         """
         print(solver, ":density compensation method. I won't recommend it as the GPU version is not needed! Try the CPU version")

#          nufft.st['W'] = nufft._pipe_density(maxiter=maxiter,*args, **kwargs)
#  
#          x2 = nufft.adjoint(nufft.st['W']*gy)
         return x2
#             return gx        
    elif 'cg' == solver:
 
        from reikna.algorithms import Reduce, Predicate, predicate_sum
         
        nufft.reduce_sum = Reduce(numpy.zeros(nufft.Kd, dtype = nufft.dtype), predicate_sum(dtype)).compile(nufft.thr)      
#         nufft.reduce_sum  = nufft.reduce_sum.compile(nufft.thr)        
         
        
        # update: b = spH * gy         
        b = nufft._y2k_device(gy)
        
        # Initialize x = b
        x   =   nufft.thr.copy_array( b)
        rsold = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
#         rsold.fill(0.0+0.0j)
        nufft.reduce_sum(rsold, x)
#         print('x',rsold) 
         
        # initialize r = b - A * x
        r   =   nufft.thr.empty_like( b)
        
#         r.fill(0.0 + 0.0j) 
        y_tmp = nufft._k2y_device(x)
        
        Ax = nufft._y2k_device(y_tmp)
        
        del y_tmp
        rsold = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
#         rsold.fill(0.0 + 0.0j)
        nufft.reduce_sum(rsold, Ax)
#         print('Ax',rsold) 
        nufft.prg.cAddVec(b, - Ax, r , local_size=None, global_size = int(nufft.batch * nufft.Kdprod))
        
#         nufft.thr.synchronize()
        # p = r
        p   =   nufft.thr.copy_array(r)
        
        # rsold = r' * r
        tmp_array = nufft.thr.empty_like( r)
#         tmp_array.fill(0.0 + 0.0j)
        nufft.prg.cMultiplyConjVec(r, r, tmp_array, local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#         nufft.thr.synchronize()
        rsold = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
#         rsold.fill(0.0 + 0.0j)
        nufft.reduce_sum(rsold, tmp_array)
 
        # allocate Ap
#         Ap  =   nufft.thr.empty_like( b)     
 
        rsnew = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
#         rsnew.fill(0.0 + 0.0j)
        tmp_sum = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
#         tmp_sum.fill(0.0 + 0.0j)
        for pp in range(0, maxiter):
            
            tmp_p = nufft._k2y_device(p)
            Ap = nufft._y2k_device(tmp_p)
            del tmp_p
#             alpha = rs_old/(p'*Ap)
            nufft.prg.cMultiplyConjVec(p, Ap, tmp_array, local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#             nufft.thr.synchronize()
            nufft.reduce_sum(tmp_sum, tmp_array)
            
            alpha = rsold / tmp_sum
#             alpha_cpu = alpha.get()
#             if numpy.isnan(alpha_cpu):
#                 alpha_cpu = 0 # avoid singularity
                
#             print(tmp_sum, alpha, rsold)
#             print(pp,rsold , alpha, numpy.sum(tmp_array.get()) )
            # x = x + alpha*p
            p2 = nufft.thr.copy_array(p)
            
            nufft.prg.cMultiplyScalar(alpha.get(), p2,  local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#             nufft.thr.synchronize()
#             nufft.prg.cAddVec(x, alpha, local_size=None, global_size=int(nufft.Kdprod))
            x += p2
 
            # r = r - alpha * Ap
            p2= nufft.thr.copy_array(Ap)
#             nufft.thr.synchronize()
            nufft.prg.cMultiplyScalar(alpha.get(), p2,  local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#             nufft.thr.synchronize()
            r -= p2
#             print(pp, numpy.sum(x.get()), numpy.sum(r.get()))
            # rs_new = r'*r
             
            nufft.prg.cMultiplyConjVec(r,    r,  tmp_array, local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#             nufft.thr.synchronize()
            nufft.reduce_sum(rsnew, tmp_array)        
             
            # tmp_sum = p = r + (rs_new/rs_old)*p
            beta = rsnew/rsold
#             beta_cpu = beta.get()
#             if numpy.isnan(beta_cpu):
#                 beta_cpu = 0
#             print(beta, rsnew, rsold)
            p2= nufft.thr.copy_array(p)
            nufft.prg.cMultiplyScalar(beta.get(),   p2,  local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#             nufft.thr.synchronize()
            nufft.prg.cAddVec(r, p2, p, local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#             nufft.thr.synchronize()
            p = r + p2
             
            rsold =nufft.thr.copy_array( rsnew)
#             nufft.thr.synchronize()
        # end of iteration    
         
        # copy result to k_Kd2
#         nufft.k_Kd2 = nufft.thr.copy_array(x)
         
        # inverse FFT: k_Kd2 -> x_Nd
        x2 = nufft._k2xx_device(x) # x is the solved k space
        
        # rescale the SnGPUArray
        # x2 /= nufft.volume['gpu_sense2']
#         x3 = nufft.x2s(x2) # combine multi-coil to single-coil
        try:
            x2 /= nufft.volume['SnGPUArray']
        except:
            
            nufft.prg.cTensorMultiply(numpy.uint32(nufft.batch), 
                                    numpy.uint32(nufft.tSN['Tdims']),
                                    nufft.tSN['Td'],
                                    nufft.tSN['Td_elements'],
                                    nufft.tSN['invTd_elements'],
                                    nufft.tSN['tensor_sn'], 
                                    x2, 
                                    numpy.uint32(1), # division, 1 is true
                                    local_size = None, global_size = int(nufft.batch*nufft.Ndprod))
         
        return x2
예제 #11
0
    def _build_plan(self, plan_factory, device_params, output, alpha, beta):

        plan = plan_factory()

        samples, modes = alpha.shape

        for_reduction = Type(alpha.dtype, (samples, self._max_total_clicks + 1))

        prepared_state = plan.temp_array_like(alpha)

        plan.kernel_call(
            TEMPLATE.get_def("compound_click_probability_prepare"),
            [prepared_state, alpha, beta],
            kernel_name="compound_click_probability_prepare",
            global_size=alpha.shape,
            render_kwds=dict(
                mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                exp_c=functions.exp(alpha.dtype),
                ))

        # Block size is limited by the amount of available local memory.
        # In some OpenCL implementations the number reported cannot actually be fully used
        # (because it's used by kernel arguments), so we're padding it a little.
        local_mem_size = device_params.local_mem_size
        max_elems = (local_mem_size - 256) // alpha.dtype.itemsize
        block_size = 2**helpers.log2(max_elems)

        # No reason to have block size larger than the number of modes
        block_size = min(block_size, helpers.bounding_power_of_2(modes))

        products_gsize = (samples, helpers.min_blocks(self._max_total_clicks + 1, block_size) * block_size)
        products = plan.temp_array_like(for_reduction)

        read_size = min(block_size, device_params.max_work_group_size)

        while read_size > 1:

            full_steps = modes // block_size
            remainder_size = modes % block_size

            try:
                plan.kernel_call(
                    TEMPLATE.get_def("compound_click_probability_aggregate"),
                    [products, prepared_state],
                    kernel_name="compound_click_probability_aggregate",
                    global_size=products_gsize,
                    local_size=(1, read_size,),
                    render_kwds=dict(
                        block_size=block_size,
                        read_size=read_size,
                        full_steps=full_steps,
                        remainder_size=remainder_size,
                        output_size=self._max_total_clicks + 1,
                        mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                        add_cc=functions.add(alpha.dtype, alpha.dtype),
                        polar_unit=functions.polar_unit(dtypes.real_for(alpha.dtype)),
                        modes=self._system.modes,
                        max_total_clicks=self._max_total_clicks,
                        ))

            except OutOfResourcesError:
                read_size //= 2

            break

        reduction = Reduce(for_reduction, predicate_sum(alpha.dtype), axes=(0,))

        temp = plan.temp_array_like(reduction.parameter.output)

        plan.computation_call(reduction, temp, products)

        fft = FFT(temp)
        real_trf = Transformation([
            Parameter('output', Annotation(output, 'o')),
            Parameter('input', Annotation(temp, 'i')),
            ],
            """
                ${input.ctype} val = ${input.load_same};
                ${output.store_same}(val.x);
                """)
        fft.parameter.output.connect(real_trf, real_trf.input, output_p=real_trf.output)

        plan.computation_call(fft, output, temp, True)

        return plan