Пример #1
0
def test_summation(thr):

    perf_size = 2**22
    dtype = dtypes.normalize_type(numpy.int64)

    a = get_test_array(perf_size, dtype)
    a_dev = thr.to_device(a)

    rd = Reduce(a, predicate_sum(dtype))

    b_dev = thr.empty_like(rd.parameter.output)
    b_ref = numpy.array([a.sum()], dtype)

    rdc = rd.compile(thr)

    attempts = 10
    times = []
    for i in range(attempts):
        t1 = time.time()
        rdc(b_dev, a_dev)
        thr.synchronize()
        times.append(time.time() - t1)

    assert diff_is_negligible(b_dev.get(), b_ref)

    return min(times), perf_size * dtype.itemsize
Пример #2
0
def test_summation(thr):

    perf_size = 2 ** 22
    dtype = dtypes.normalize_type(numpy.int64)

    a = get_test_array(perf_size, dtype)
    a_dev = thr.to_device(a)

    rd = Reduce(a, predicate_sum(dtype))

    b_dev = thr.empty_like(rd.parameter.output)
    b_ref = numpy.array([a.sum()], dtype)

    rdc = rd.compile(thr)

    attempts = 10
    times = []
    for i in range(attempts):
        t1 = time.time()
        rdc(b_dev, a_dev)
        thr.synchronize()
        times.append(time.time() - t1)

    assert diff_is_negligible(b_dev.get(), b_ref)

    return min(times), perf_size * dtype.itemsize
Пример #3
0
def test_structure_type(thr):

    shape = (100, 100)
    dtype = dtypes.align(
        numpy.dtype([('i1', numpy.uint32),
                     ('nested', numpy.dtype([
                         ('v', numpy.uint64),
                     ])), ('i2', numpy.uint32)]))

    a = get_test_array(shape, dtype)
    a_dev = thr.to_device(a)

    # Have to construct the resulting array manually,
    # since numpy cannot reduce arrays with struct dtypes.
    b_ref = numpy.empty(100, dtype)
    b_ref['i1'] = a['i1'].sum(0)
    b_ref['nested']['v'] = a['nested']['v'].sum(0)
    b_ref['i2'] = a['i2'].sum(0)

    predicate = Predicate(
        Snippet.create(lambda v1, v2: """
            ${ctype} result = ${v1};
            result.i1 += ${v2}.i1;
            result.nested.v += ${v2}.nested.v;
            result.i2 += ${v2}.i2;
            return result;
            """,
                       render_kwds=dict(ctype=dtypes.ctype_module(dtype))),
        numpy.zeros(1, dtype)[0])

    rd = Reduce(a_dev, predicate, axes=(0, ))

    b_dev = thr.empty_like(rd.parameter.output)

    rdc = rd.compile(thr)
    rdc(b_dev, a_dev)
    b_res = b_dev.get()

    # Array.get() runs numpy.lib.stride_tricks.as_strided() on the array,
    # which adds dummy variables instead of custom offsets (and our `dtype` has them),
    # making the result dtype different, and failing the test.
    # For now we will just note the difference and convert the result
    # back to the original dtype (they are still compatible).
    # When the behavior changes, the test will start to fail and we will notice.
    # See inducer/compyte issue #26.
    wrong_dtype = b_res.dtype != b_dev.dtype

    b_res = b_res.astype(dtype)
    assert diff_is_negligible(b_res, b_ref)

    if wrong_dtype:
        pytest.xfail("as_strided() still corrupts the datatype")
    else:
        pytest.fail("as_strided() does not corrupt the datatype anymore, "
                    "we can remove the `astype()` now")
Пример #4
0
def test_normal(thr, shape, axis):

    a = get_test_array(shape, numpy.int64)
    a_dev = thr.to_device(a)

    rd = Reduce(a, predicate_sum(numpy.int64), axes=(axis,) if axis is not None else None)

    b_dev = thr.empty_like(rd.parameter.output)
    b_ref = a.sum(axis)

    rdc = rd.compile(thr)
    rdc(b_dev, a_dev)

    assert diff_is_negligible(b_dev.get(), b_ref)
Пример #5
0
def test_nonsequential_axes(thr):

    shape = (50, 40, 30, 20)
    a = get_test_array(shape, numpy.int64)
    a_dev = thr.to_device(a)
    b_ref = a.sum(0).sum(1)  # sum over axes 0 and 2 of the initial array

    rd = Reduce(a_dev, predicate_sum(numpy.int64), axes=(0, 2))

    b_dev = thr.empty_like(rd.parameter.output)

    rdc = rd.compile(thr)
    rdc(b_dev, a_dev)

    assert diff_is_negligible(b_dev.get(), b_ref)
Пример #6
0
def test_nonsequential_axes(thr):

    shape = (50, 40, 30, 20)
    a = get_test_array(shape, numpy.int64)
    a_dev = thr.to_device(a)
    b_ref = a.sum(0).sum(1) # sum over axes 0 and 2 of the initial array

    rd = Reduce(a_dev, predicate_sum(numpy.int64), axes=(0,2))

    b_dev = thr.empty_like(rd.parameter.output)

    rdc = rd.compile(thr)
    rdc(b_dev, a_dev)

    assert diff_is_negligible(b_dev.get(), b_ref)
Пример #7
0
    def _build_plan(self, plan_factory, device_params, output, alpha, beta):

        plan = plan_factory()

        for_reduction = Type(numpy.float64, alpha.shape)

        meter_trf = Transformation([
            Parameter('output', Annotation(for_reduction, 'o')),
            Parameter('alpha', Annotation(alpha, 'i')),
            Parameter('beta', Annotation(beta, 'i')),
            ],
            """
                ${alpha.ctype} alpha = ${alpha.load_same};
                ${beta.ctype} beta = ${beta.load_same};
                ${alpha.ctype} t = ${mul_cc}(alpha, beta);
                ${alpha.ctype} np = ${exp_c}(COMPLEX_CTR(${alpha.ctype})(-t.x, -t.y));
                ${alpha.ctype} cp = COMPLEX_CTR(${alpha.ctype})(1 - np.x, -np.y);
                ${output.store_same}(cp.x);
                """,
            render_kwds=dict(
                mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                exp_c=functions.exp(alpha.dtype),
                ))

        reduction = Reduce(for_reduction, predicate_sum(output.dtype), axes=(0,))
        reduction.parameter.input.connect(
            meter_trf, meter_trf.output, alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

        plan.computation_call(reduction, output, alpha, beta)

        return plan
Пример #8
0
    def _build_plan(self, plan_factory, device_params, output, matrix, vector):
        plan = plan_factory()

        summation = Reduce(matrix,
                           predicate_sum(matrix.dtype),
                           axes=(len(matrix.shape) - 1, ))

        mul_vec = Transformation(
            [
                Parameter('output', Annotation(matrix, 'o')),
                Parameter('matrix', Annotation(matrix, 'i')),
                Parameter('vector', Annotation(vector, 'i'))
            ],
            """
            ${output.store_same}(${mul}(${matrix.load_same}, ${vector.load_idx}(${idxs[-1]})));
            """,
            render_kwds=dict(mul=functions.mul(matrix.dtype, vector.dtype)),
            connectors=['output', 'matrix'])

        summation.parameter.input.connect(mul_vec,
                                          mul_vec.output,
                                          matrix=mul_vec.matrix,
                                          vector=mul_vec.vector)

        plan.computation_call(summation, output, matrix, vector)

        return plan
Пример #9
0
def test_normal(thr, shape, axis):

    a = get_test_array(shape, numpy.int64)
    a_dev = thr.to_device(a)

    rd = Reduce(a,
                predicate_sum(numpy.int64),
                axes=(axis, ) if axis is not None else None)

    b_dev = thr.empty_like(rd.parameter.output)
    b_ref = a.sum(axis)

    rdc = rd.compile(thr)
    rdc(b_dev, a_dev)

    assert diff_is_negligible(b_dev.get(), b_ref)
Пример #10
0
    def _build_plan(self, plan_factory, device_params, output, alpha, beta):

        plan = plan_factory()

        for_reduction = Type(numpy.float64, alpha.shape)

        meter_trf = Transformation([
            Parameter('output', Annotation(for_reduction, 'o')),
            Parameter('alpha', Annotation(alpha, 'i')),
            Parameter('beta', Annotation(beta, 'i')),
            ],
            """
                ${alpha.ctype} alpha = ${alpha.load_same};
                ${beta.ctype} beta = ${beta.load_same};
                ${alpha.ctype} t = ${mul_cc}(alpha, beta);
                ${output.store_same}(t.x - ${ordering});
                """,
            render_kwds=dict(
                mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                ordering=ordering(self._representation),
                ))

        reduction = Reduce(for_reduction, predicate_sum(output.dtype), axes=(0,))
        reduction.parameter.input.connect(
            meter_trf, meter_trf.output, alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

        plan.computation_call(reduction, output, alpha, beta)

        return plan
Пример #11
0
    def _build_plan(self, plan_factory, device_params, output, input_):

        plan = plan_factory()

        N = input_.shape[-1] * 2
        batch_shape = input_.shape[:-1]
        batch_size = helpers.product(batch_shape)

        coeffs1 = 4 * numpy.sin(2 * numpy.pi * numpy.arange(N // 2) / N)
        coeffs2 = 2 * numpy.cos(2 * numpy.pi * numpy.arange(N // 2) / N)

        c1_arr = plan.persistent_array(coeffs1)
        c2_arr = plan.persistent_array(coeffs2)

        multiply = get_multiply(input_)

        # re_X_1 = sum(x * coeffs2)

        t = plan.temp_array_like(input_)
        rd = Reduce(t,
                    predicate_sum(input_.dtype),
                    axes=(len(input_.shape) - 1, ))

        rd.parameter.input.connect(multiply,
                                   multiply.output,
                                   x=multiply.a,
                                   c2=multiply.b)

        re_X_0 = plan.temp_array_like(rd.parameter.output)
        plan.computation_call(rd, re_X_0, input_, c2_arr)

        # Y = numpy.fft.rfft(x * coeffs1)

        rfft = RFFT(input_, dont_store_last=True)
        rfft.parameter.input.connect(multiply,
                                     multiply.output,
                                     x=multiply.a,
                                     c1=multiply.b)

        Y = plan.temp_array_like(rfft.parameter.output)
        plan.computation_call(rfft, Y, input_, c1_arr)

        # Y *= -1j
        # Y[0] /= 2
        # Y[0] += re_X_1
        # res = numpy.cumsum(Y[:-1])

        prepare_prfft_scan = get_prepare_prfft_scan(Y)

        sc = Scan(Y, predicate_sum(Y.dtype), axes=(-1, ), exclusive=False)
        sc.parameter.input.connect(prepare_prfft_scan,
                                   prepare_prfft_scan.output,
                                   Y=prepare_prfft_scan.Y,
                                   re_X_0=prepare_prfft_scan.re_X_0)

        plan.computation_call(sc, output, Y, re_X_0)

        return plan
Пример #12
0
def test_nondefault_function(thr):

    shape = (100, 100)
    a = get_test_array(shape, numpy.int64)
    a_dev = thr.to_device(a)
    b_ref = a.sum(0)

    predicate = Predicate(
        Snippet.create(lambda v1, v2: "return ${v1} + ${v2};"), 0)

    rd = Reduce(a_dev, predicate, axes=(0, ))

    b_dev = thr.empty_like(rd.parameter.output)

    rdc = rd.compile(thr)
    rdc(b_dev, a_dev)

    assert diff_is_negligible(b_dev.get(), b_ref)
Пример #13
0
def test_nondefault_function(thr):

    shape = (100, 100)
    a = get_test_array(shape, numpy.int64)
    a_dev = thr.to_device(a)
    b_ref = a.sum(0)

    predicate = Predicate(
        Snippet.create(lambda v1, v2: "return ${v1} + ${v2};"),
        0)

    rd = Reduce(a_dev, predicate, axes=(0,))

    b_dev = thr.empty_like(rd.parameter.output)

    rdc = rd.compile(thr)
    rdc(b_dev, a_dev)

    assert diff_is_negligible(b_dev.get(), b_ref)
Пример #14
0
def test_structure_type(thr):

    shape = (100, 100)
    dtype = dtypes.align(numpy.dtype([
        ('i1', numpy.uint32),
        ('nested', numpy.dtype([
            ('v', numpy.uint64),
            ])),
        ('i2', numpy.uint32)
        ]))

    a = get_test_array(shape, dtype)
    a_dev = thr.to_device(a)

    # Have to construct the resulting array manually,
    # since numpy cannot reduce arrays with struct dtypes.
    b_ref = numpy.empty(100, dtype)
    b_ref['i1'] = a['i1'].sum(0)
    b_ref['nested']['v'] = a['nested']['v'].sum(0)
    b_ref['i2'] = a['i2'].sum(0)

    predicate = Predicate(
        Snippet.create(lambda v1, v2: """
            ${ctype} result = ${v1};
            result.i1 += ${v2}.i1;
            result.nested.v += ${v2}.nested.v;
            result.i2 += ${v2}.i2;
            return result;
            """,
            render_kwds=dict(
                ctype=dtypes.ctype_module(dtype))),
        numpy.zeros(1, dtype)[0])

    rd = Reduce(a_dev, predicate, axes=(0,))

    b_dev = thr.empty_like(rd.parameter.output)

    rdc = rd.compile(thr)
    rdc(b_dev, a_dev)
    b_res = b_dev.get()

    assert diff_is_negligible(b_res, b_ref)
Пример #15
0
def test_structure_type(thr):

    shape = (100, 100)
    dtype = dtypes.align(
        numpy.dtype([('i1', numpy.uint32),
                     ('nested', numpy.dtype([
                         ('v', numpy.uint64),
                     ])), ('i2', numpy.uint32)]))

    a = get_test_array(shape, dtype)
    a_dev = thr.to_device(a)

    # Have to construct the resulting array manually,
    # since numpy cannot reduce arrays with struct dtypes.
    b_ref = numpy.empty(100, dtype)
    b_ref['i1'] = a['i1'].sum(0)
    b_ref['nested']['v'] = a['nested']['v'].sum(0)
    b_ref['i2'] = a['i2'].sum(0)

    predicate = Predicate(
        Snippet.create(lambda v1, v2: """
            ${ctype} result = ${v1};
            result.i1 += ${v2}.i1;
            result.nested.v += ${v2}.nested.v;
            result.i2 += ${v2}.i2;
            return result;
            """,
                       render_kwds=dict(ctype=dtypes.ctype_module(dtype))),
        numpy.zeros(1, dtype)[0])

    rd = Reduce(a_dev, predicate, axes=(0, ))

    b_dev = thr.empty_like(rd.parameter.output)

    rdc = rd.compile(thr)
    rdc(b_dev, a_dev)
    b_res = b_dev.get()

    assert diff_is_negligible(b_res, b_ref)
Пример #16
0
    def _build_plan(self, plan_factory, device_params, output, input_):

        plan = plan_factory()

        N = input_.shape[-1] * 4
        batch_shape = input_.shape[:-1]
        batch_size = helpers.product(batch_shape)

        # The first element is unused
        coeffs = numpy.concatenate(
            [[0],
             1 / (4 * numpy.sin(2 * numpy.pi * numpy.arange(1, N // 2) / N))])
        coeffs_arr = plan.persistent_array(coeffs)

        prepare_iprfft_input = get_prepare_iprfft_input(input_)
        prepare_iprfft_output = get_prepare_iprfft_output(output)

        irfft = IRFFT(prepare_iprfft_input.Y)
        irfft.parameter.input.connect(prepare_iprfft_input,
                                      prepare_iprfft_input.Y,
                                      X=prepare_iprfft_input.X)
        irfft.parameter.output.connect(prepare_iprfft_output,
                                       prepare_iprfft_output.y,
                                       x=prepare_iprfft_output.x,
                                       x0=prepare_iprfft_output.x0,
                                       coeffs=prepare_iprfft_output.coeffs)

        real = Transformation([
            Parameter(
                'output',
                Annotation(Type(dtypes.real_for(input_.dtype), input_.shape),
                           'o')),
            Parameter('input', Annotation(input_, 'i')),
        ],
                              """
            ${output.store_same}((${input.load_same}).x);
            """,
                              connectors=['output'])

        rd_t = Type(output.dtype, input_.shape)
        rd = Reduce(rd_t,
                    predicate_sum(rd_t.dtype),
                    axes=(len(input_.shape) - 1, ))
        rd.parameter.input.connect(real, real.output, X=real.input)

        x0 = plan.temp_array_like(rd.parameter.output)

        plan.computation_call(rd, x0, input_)
        plan.computation_call(irfft, output, x0, coeffs_arr, input_)

        return plan
Пример #17
0
    def _build_plan(self, plan_factory, device_params, output, alpha, beta):

        plan = plan_factory()

        for_reduction = Type(numpy.float64, (alpha.shape[0], self._max_click_order))

        meter_trf = Transformation([
            Parameter('output', Annotation(for_reduction, 'o')),
            Parameter('alpha', Annotation(alpha, 'i')),
            Parameter('beta', Annotation(beta, 'i')),
            ],
            """
                VSIZE_T sample_idx = ${idxs[0]};
                VSIZE_T order = ${idxs[1]} + 1;

                ${alpha.ctype} result = COMPLEX_CTR(${alpha.ctype})(1, 0);
                for (VSIZE_T i = 0; i < ${modes}; i++) {
                    ${alpha.ctype} alpha = ${alpha.load_idx}(sample_idx, i);
                    ${beta.ctype} beta = ${beta.load_idx}(sample_idx, i);
                    ${alpha.ctype} t = ${mul_cc}(alpha, beta);
                    ${alpha.ctype} np = ${exp_c}(COMPLEX_CTR(${alpha.ctype})(-t.x, -t.y));

                    if (i >= order) {
                        result = ${mul_cc}(result, np);
                    }
                    else {
                        ${alpha.ctype} cp = COMPLEX_CTR(${alpha.ctype})(1 - np.x, -np.y);
                        result = ${mul_cc}(result, cp);
                    }
                }

                ${output.store_same}(result.x);
                """,
            render_kwds=dict(
                mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                exp_c=functions.exp(alpha.dtype),
                modes=self._system.modes,
                ))

        reduction = Reduce(for_reduction, predicate_sum(output.dtype), axes=(0,))
        reduction.parameter.input.connect(
            meter_trf, meter_trf.output, alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

        plan.computation_call(reduction, output, alpha, beta)

        return plan
Пример #18
0
    def __init__(self, arr_t, order=2, axes=None):
        tr_elems = norm_const(arr_t, order)
        out_dtype = tr_elems.output.dtype

        rd = Reduce(Type(out_dtype, arr_t.shape), predicate_sum(out_dtype), axes=axes)

        res_t = rd.parameter.output
        tr_sum = norm_const(res_t, 1. / order)

        rd.parameter.input.connect(tr_elems, tr_elems.output, input_prime=tr_elems.input)
        rd.parameter.output.connect(tr_sum, tr_sum.input, output_prime=tr_sum.output)

        self._rd = rd

        Computation.__init__(self, [
            Parameter('output', Annotation(res_t, 'o')),
            Parameter('input', Annotation(arr_t, 'i'))])
Пример #19
0
    def _build_plan(
            self, plan_factory, device_params,
            ks_a, ks_b, ks_cv, in_key, out_key, noises_a, noises_b):

        plan = plan_factory()

        extracted_n, t, base, inner_n = ks_a.shape

        mean = Reduce(noises_b, predicate_sum(noises_b.dtype))
        norm = transformations.div_const(mean.parameter.output, numpy.prod(noises_b.shape))
        mean.parameter.output.connect(norm, norm.input, mean=norm.output)

        noises_b_mean = plan.temp_array_like(mean.parameter.mean)

        mul_key = MatrixMulVector(noises_a)
        b_term = plan.temp_array_like(mul_key.parameter.output)

        build_keyswitch = PureParallel([
            Parameter('ks_a', Annotation(ks_a, 'o')),
            Parameter('ks_b', Annotation(ks_b, 'o')),
            Parameter('ks_cv', Annotation(ks_cv, 'o')),
            Parameter('in_key', Annotation(in_key, 'i')),
            Parameter('b_term', Annotation(b_term, 'i')),
            Parameter('noises_a', Annotation(noises_a, 'i')),
            Parameter('noises_b', Annotation(noises_b, 'i')),
            Parameter('noises_b_mean', Annotation(noises_b_mean, 'i'))],
            Snippet(
                TEMPLATE.get_def("make_lwe_keyswitch_key"),
                render_kwds=dict(
                    log2_base=self._log2_base, output_size=self._output_size,
                    double_to_t32=double_to_t32_module, noise=self._noise)),
            guiding_array="ks_b")

        plan.computation_call(mean, noises_b_mean, noises_b)
        plan.computation_call(mul_key, b_term, noises_a, out_key)
        plan.computation_call(
            build_keyswitch,
            ks_a, ks_b, ks_cv, in_key, b_term, noises_a, noises_b, noises_b_mean)

        return plan
Пример #20
0
    def _build_plan(self, plan_factory, device_params, output, alpha, beta):

        plan = plan_factory()

        for_reduction = Type(numpy.float64, (alpha.shape[0], self._max_moment))

        meter_trf = Transformation([
            Parameter('output', Annotation(for_reduction, 'o')),
            Parameter('alpha', Annotation(alpha, 'i')),
            Parameter('beta', Annotation(beta, 'i')),
            ],
            """
                VSIZE_T sample_idx = ${idxs[0]};
                VSIZE_T order = ${idxs[1]};

                ${alpha.ctype} result = COMPLEX_CTR(${alpha.ctype})(1, 0);
                for (VSIZE_T i = 0; i <= order; i++) {
                    ${alpha.ctype} alpha = ${alpha.load_idx}(sample_idx, i);
                    ${beta.ctype} beta = ${beta.load_idx}(sample_idx, i);
                    ${alpha.ctype} t = ${mul_cc}(alpha, beta);
                    t.x -= ${ordering};
                    result = ${mul_cc}(result, t);
                }
                ${output.store_same}(result.x);
                """,
            render_kwds=dict(
                mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                ordering=ordering(self._representation),
                ))

        reduction = Reduce(for_reduction, predicate_sum(output.dtype), axes=(0,))
        reduction.parameter.input.connect(
            meter_trf, meter_trf.output, alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

        plan.computation_call(reduction, output, alpha, beta)

        return plan
Пример #21
0
# Test array
arr = numpy.random.randint(0, 10**6, 20000)


# A transformation that creates initial minmax structures for the given array of integers
to_mmc = Transformation(
    [Parameter('output', Annotation(Type(mmc_dtype, arr.shape), 'o')),
    Parameter('input', Annotation(arr, 'i'))],
    """
    ${output.ctype} res;
    res.cur_min = ${input.load_same};
    res.cur_max = ${input.load_same};
    ${output.store_same}(res);
    """)


# Create the reduction computation and attach the transformation above to its input.
reduction = Reduce(to_mmc.output, predicate)
reduction.parameter.input.connect(to_mmc, to_mmc.output, new_input=to_mmc.input)
creduction = reduction.compile(thr)


# Run the computation
arr_dev = thr.to_device(arr)
res_dev = thr.empty_like(reduction.parameter.output)
creduction(res_dev, arr_dev)
minmax = res_dev.get()

assert minmax["cur_min"] == arr.min()
assert minmax["cur_max"] == arr.max()
Пример #22
0
# Test array
arr = numpy.random.randint(0, 10**6, 20000)

# A transformation that creates initial minmax structures for the given array of integers
to_mmc = Transformation([
    Parameter('output', Annotation(Type(mmc_dtype, arr.shape), 'o')),
    Parameter('input', Annotation(arr, 'i'))
], """
    ${output.ctype} res;
    res.cur_min = ${input.load_same};
    res.cur_max = ${input.load_same};
    ${output.store_same}(res);
    """)

# Create the reduction computation and attach the transformation above to its input.
reduction = Reduce(to_mmc.output, predicate)
reduction.parameter.input.connect(to_mmc,
                                  to_mmc.output,
                                  new_input=to_mmc.input)
creduction = reduction.compile(thr)

# Run the computation
arr_dev = thr.to_device(arr)
res_dev = thr.empty_like(reduction.parameter.output)
creduction(res_dev, arr_dev)
minmax = res_dev.get()

assert minmax["cur_min"] == arr.min()
assert minmax["cur_max"] == arr.max()
Пример #23
0
    def _build_plan(self, plan_factory, device_params, output, alpha, beta):

        plan = plan_factory()

        samples, modes = alpha.shape

        for_reduction = Type(alpha.dtype, (samples, self._max_total_clicks + 1))

        prepared_state = plan.temp_array_like(alpha)

        plan.kernel_call(
            TEMPLATE.get_def("compound_click_probability_prepare"),
            [prepared_state, alpha, beta],
            kernel_name="compound_click_probability_prepare",
            global_size=alpha.shape,
            render_kwds=dict(
                mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                exp_c=functions.exp(alpha.dtype),
                ))

        # Block size is limited by the amount of available local memory.
        # In some OpenCL implementations the number reported cannot actually be fully used
        # (because it's used by kernel arguments), so we're padding it a little.
        local_mem_size = device_params.local_mem_size
        max_elems = (local_mem_size - 256) // alpha.dtype.itemsize
        block_size = 2**helpers.log2(max_elems)

        # No reason to have block size larger than the number of modes
        block_size = min(block_size, helpers.bounding_power_of_2(modes))

        products_gsize = (samples, helpers.min_blocks(self._max_total_clicks + 1, block_size) * block_size)
        products = plan.temp_array_like(for_reduction)

        read_size = min(block_size, device_params.max_work_group_size)

        while read_size > 1:

            full_steps = modes // block_size
            remainder_size = modes % block_size

            try:
                plan.kernel_call(
                    TEMPLATE.get_def("compound_click_probability_aggregate"),
                    [products, prepared_state],
                    kernel_name="compound_click_probability_aggregate",
                    global_size=products_gsize,
                    local_size=(1, read_size,),
                    render_kwds=dict(
                        block_size=block_size,
                        read_size=read_size,
                        full_steps=full_steps,
                        remainder_size=remainder_size,
                        output_size=self._max_total_clicks + 1,
                        mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                        add_cc=functions.add(alpha.dtype, alpha.dtype),
                        polar_unit=functions.polar_unit(dtypes.real_for(alpha.dtype)),
                        modes=self._system.modes,
                        max_total_clicks=self._max_total_clicks,
                        ))

            except OutOfResourcesError:
                read_size //= 2

            break

        reduction = Reduce(for_reduction, predicate_sum(alpha.dtype), axes=(0,))

        temp = plan.temp_array_like(reduction.parameter.output)

        plan.computation_call(reduction, temp, products)

        fft = FFT(temp)
        real_trf = Transformation([
            Parameter('output', Annotation(output, 'o')),
            Parameter('input', Annotation(temp, 'i')),
            ],
            """
                ${input.ctype} val = ${input.load_same};
                ${output.store_same}(val.x);
                """)
        fft.parameter.output.connect(real_trf, real_trf.input, output_p=real_trf.output)

        plan.computation_call(fft, output, temp, True)

        return plan
Пример #24
0
def solve(nufft,gy, solver=None,  maxiter=30, *args, **kwargs):
    """
    The solve function of NUFFT_hsa.
    The current version supports solvers = 'cg' or 'L1TVOLS'. 
    
    :param nufft: NUFFT_hsa object
    :param y: (M,) or (M, batch) array, non-uniform data. If batch is provided, 'cg' and 'L1TVOLS' returns different image shape.
    :type y: numpy.complex64 reikna array
    :return: x: Nd or Nd + (batch, ) image. L1TVOLS always returns Nd. 'cg' returns Nd + (batch, ) in batch mode. 
    :rtype: x: reikna array, complex64. 
    """
    # define the reduction kernel on the device
#     if None ==  solver:
#         solver  =   'cg'
    if 'L1TVLAD' == solver:
        x2=L1TVLAD(nufft, gy, maxiter=maxiter, *args, **kwargs  )
#         x2 = nufft.thr.copy_array(nufft.x_Nd)
        return x2
    elif 'L1TVOLS' == solver:
        x2=L1TVOLS(nufft, gy, maxiter=maxiter, *args, **kwargs  )
#         x2 = nufft.thr.copy_array(nufft.x_Nd)
        return x2
    elif 'dc'   ==  solver:
         """
         Density compensation method
         nufft.st['W'] will be computed if doesn't exist
         If nufft.st['W'] exist then x2 = nufft.adjoint(nufft.st['W']*y)
         input: 
             y: (M,) array
         output:
             x2: Nd array
         """
         print(solver, ":density compensation method. I won't recommend it as the GPU version is not needed! Try the CPU version")

#          nufft.st['W'] = nufft._pipe_density(maxiter=maxiter,*args, **kwargs)
#  
#          x2 = nufft.adjoint(nufft.st['W']*gy)
         return x2
#             return gx        
    elif 'cg' == solver:
 
        from reikna.algorithms import Reduce, Predicate, predicate_sum
         
        nufft.reduce_sum = Reduce(numpy.zeros(nufft.Kd, dtype = nufft.dtype), predicate_sum(dtype)).compile(nufft.thr)      
#         nufft.reduce_sum  = nufft.reduce_sum.compile(nufft.thr)        
         
        
        # update: b = spH * gy         
        b = nufft._y2k_device(gy)
        
        # Initialize x = b
        x   =   nufft.thr.copy_array( b)
        rsold = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
#         rsold.fill(0.0+0.0j)
        nufft.reduce_sum(rsold, x)
#         print('x',rsold) 
         
        # initialize r = b - A * x
        r   =   nufft.thr.empty_like( b)
        
#         r.fill(0.0 + 0.0j) 
        y_tmp = nufft._k2y_device(x)
        
        Ax = nufft._y2k_device(y_tmp)
        
        del y_tmp
        rsold = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
#         rsold.fill(0.0 + 0.0j)
        nufft.reduce_sum(rsold, Ax)
#         print('Ax',rsold) 
        nufft.prg.cAddVec(b, - Ax, r , local_size=None, global_size = int(nufft.batch * nufft.Kdprod))
        
#         nufft.thr.synchronize()
        # p = r
        p   =   nufft.thr.copy_array(r)
        
        # rsold = r' * r
        tmp_array = nufft.thr.empty_like( r)
#         tmp_array.fill(0.0 + 0.0j)
        nufft.prg.cMultiplyConjVec(r, r, tmp_array, local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#         nufft.thr.synchronize()
        rsold = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
#         rsold.fill(0.0 + 0.0j)
        nufft.reduce_sum(rsold, tmp_array)
 
        # allocate Ap
#         Ap  =   nufft.thr.empty_like( b)     
 
        rsnew = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
#         rsnew.fill(0.0 + 0.0j)
        tmp_sum = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
#         tmp_sum.fill(0.0 + 0.0j)
        for pp in range(0, maxiter):
            
            tmp_p = nufft._k2y_device(p)
            Ap = nufft._y2k_device(tmp_p)
            del tmp_p
#             alpha = rs_old/(p'*Ap)
            nufft.prg.cMultiplyConjVec(p, Ap, tmp_array, local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#             nufft.thr.synchronize()
            nufft.reduce_sum(tmp_sum, tmp_array)
            
            alpha = rsold / tmp_sum
#             alpha_cpu = alpha.get()
#             if numpy.isnan(alpha_cpu):
#                 alpha_cpu = 0 # avoid singularity
                
#             print(tmp_sum, alpha, rsold)
#             print(pp,rsold , alpha, numpy.sum(tmp_array.get()) )
            # x = x + alpha*p
            p2 = nufft.thr.copy_array(p)
            
            nufft.prg.cMultiplyScalar(alpha.get(), p2,  local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#             nufft.thr.synchronize()
#             nufft.prg.cAddVec(x, alpha, local_size=None, global_size=int(nufft.Kdprod))
            x += p2
 
            # r = r - alpha * Ap
            p2= nufft.thr.copy_array(Ap)
#             nufft.thr.synchronize()
            nufft.prg.cMultiplyScalar(alpha.get(), p2,  local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#             nufft.thr.synchronize()
            r -= p2
#             print(pp, numpy.sum(x.get()), numpy.sum(r.get()))
            # rs_new = r'*r
             
            nufft.prg.cMultiplyConjVec(r,    r,  tmp_array, local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#             nufft.thr.synchronize()
            nufft.reduce_sum(rsnew, tmp_array)        
             
            # tmp_sum = p = r + (rs_new/rs_old)*p
            beta = rsnew/rsold
#             beta_cpu = beta.get()
#             if numpy.isnan(beta_cpu):
#                 beta_cpu = 0
#             print(beta, rsnew, rsold)
            p2= nufft.thr.copy_array(p)
            nufft.prg.cMultiplyScalar(beta.get(),   p2,  local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#             nufft.thr.synchronize()
            nufft.prg.cAddVec(r, p2, p, local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#             nufft.thr.synchronize()
            p = r + p2
             
            rsold =nufft.thr.copy_array( rsnew)
#             nufft.thr.synchronize()
        # end of iteration    
         
        # copy result to k_Kd2
#         nufft.k_Kd2 = nufft.thr.copy_array(x)
         
        # inverse FFT: k_Kd2 -> x_Nd
        x2 = nufft._k2xx_device(x) # x is the solved k space
        
        # rescale the SnGPUArray
        # x2 /= nufft.volume['gpu_sense2']
#         x3 = nufft.x2s(x2) # combine multi-coil to single-coil
        try:
            x2 /= nufft.volume['SnGPUArray']
        except:
            
            nufft.prg.cTensorMultiply(numpy.uint32(nufft.batch), 
                                    numpy.uint32(nufft.tSN['Tdims']),
                                    nufft.tSN['Td'],
                                    nufft.tSN['Td_elements'],
                                    nufft.tSN['invTd_elements'],
                                    nufft.tSN['tensor_sn'], 
                                    x2, 
                                    numpy.uint32(1), # division, 1 is true
                                    local_size = None, global_size = int(nufft.batch*nufft.Ndprod))
         
        return x2
Пример #25
0
# Create LUT and stringify into preamble of map kernel
LUT = np.zeros(256, np.int32)
for b in xrange(8):
    LUT[(np.arange(256) & (1 << b)) != 0] += 1
strLUT = "constant int LUT[256] = {" + ",".join(map(str, LUT)) + "};\n"
byte_to_count = Transformation([
    Parameter('output', Annotation(Type(np.int32, (1, )), 'o')),
    Parameter('input', Annotation(Type(np.uint8, (1, )), 'i'))
], strLUT + """
        ${output.store_same}(LUT[${input.load_same}]);
    """)

predicate = Predicate(
    Snippet.create(lambda v1, v2: """return ${v1} + ${v2}"""), np.int32(0))

sum_bits_reduction = Reduce(byte_to_count.output, predicate)
sum_bits_reduction.parameter.input.connect(byte_to_count,
                                           byte_to_count.output,
                                           new_input=byte_to_count.input)
sum_bits = sum_bits_reduction.compile(thr)
#sum_byte_count = ReductionKernel(cx, np.int32, neutral="0",
#                    reduce_expr="a+b", map_expr="LUT[bytes[i]]",
#                    arguments="__global unsigned char *bytes",
#                    preamble=strLUT)
#def count_bits(img):
#    return sum_byte_count(img).get().item()
#
#pixel_inds = GenericScanKernel(cx, np.int32,
#                    arguments="__global unsigned char *bytes, "
#                              "int image_w, "
#                              "__global int *pixels",
Пример #26
0
    LUT[(np.arange(256) & (1 << b)) != 0] += 1
strLUT = "constant int LUT[256] = {" + ",".join(map(str, LUT)) + "};\n"
byte_to_count = Transformation(
    [
        Parameter("output", Annotation(Type(np.int32, (1,)), "o")),
        Parameter("input", Annotation(Type(np.uint8, (1,)), "i")),
    ],
    strLUT
    + """
        ${output.store_same}(LUT[${input.load_same}]);
    """,
)

predicate = Predicate(Snippet.create(lambda v1, v2: """return ${v1} + ${v2}"""), np.int32(0))

sum_bits_reduction = Reduce(byte_to_count.output, predicate)
sum_bits_reduction.parameter.input.connect(byte_to_count, byte_to_count.output, new_input=byte_to_count.input)
sum_bits = sum_bits_reduction.compile(thr)
# sum_byte_count = ReductionKernel(cx, np.int32, neutral="0",
#                    reduce_expr="a+b", map_expr="LUT[bytes[i]]",
#                    arguments="__global unsigned char *bytes",
#                    preamble=strLUT)
# def count_bits(img):
#    return sum_byte_count(img).get().item()
#
# pixel_inds = GenericScanKernel(cx, np.int32,
#                    arguments="__global unsigned char *bytes, "
#                              "int image_w, "
#                              "__global int *pixels",
#                    # Keep count of pixels we have stored so far
#                    input_expr="LUT[bytes[i]]",