def _build_plan(self, plan_factory, device_params, output, input_):
    """Queue a reduction, an RFFT and an inclusive scan on a fresh plan.

    Three nested computations are added, in this order:

    1. a ``Reduce`` (sum predicate) over the last axis, whose input goes
       through the ``get_multiply`` transformation with the cosine table;
    2. an ``RFFT`` created with ``dont_store_last=True``, whose input goes
       through the same transformation with the sine table;
    3. a non-exclusive ``Scan`` (sum predicate) over the last axis, whose
       input goes through ``get_prepare_prfft_scan`` and whose result is
       written to ``output``.

    :param plan_factory: zero-argument callable producing the plan to fill.
    :param device_params: accepted but not read here.
    :param output: array receiving the scan result.
    :param input_: source array; ``N`` is twice the length of its last axis.
    :returns: the filled plan.
    """
    plan = plan_factory()

    N = input_.shape[-1] * 2
    # Not read further in this method.
    batch_shape = input_.shape[:-1]
    batch_size = helpers.product(batch_shape)

    # Both tables are evaluated at the angles 2*pi*k/N for k = 0 .. N/2 - 1.
    angles = 2 * numpy.pi * numpy.arange(N // 2) / N
    sin_table = 4 * numpy.sin(angles)
    cos_table = 2 * numpy.cos(angles)
    sin_dev = plan.persistent_array(sin_table)
    cos_dev = plan.persistent_array(cos_table)

    multiply = get_multiply(input_)
    last_axis = len(input_.shape) - 1

    # re_X_1 = sum(x * coeffs2)
    reduce_like = plan.temp_array_like(input_)
    reducer = Reduce(reduce_like, predicate_sum(input_.dtype), axes=(last_axis,))
    reducer.parameter.input.connect(
        multiply, multiply.output, x=multiply.a, c2=multiply.b)
    re_X_0 = plan.temp_array_like(reducer.parameter.output)
    plan.computation_call(reducer, re_X_0, input_, cos_dev)

    # Y = numpy.fft.rfft(x * coeffs1)
    transform = RFFT(input_, dont_store_last=True)
    transform.parameter.input.connect(
        multiply, multiply.output, x=multiply.a, c1=multiply.b)
    Y = plan.temp_array_like(transform.parameter.output)
    plan.computation_call(transform, Y, input_, sin_dev)

    # Y *= -1j
    # Y[0] /= 2
    # Y[0] += re_X_1
    # res = numpy.cumsum(Y[:-1])
    prepare_prfft_scan = get_prepare_prfft_scan(Y)
    scanner = Scan(Y, predicate_sum(Y.dtype), axes=(-1,), exclusive=False)
    scanner.parameter.input.connect(
        prepare_prfft_scan,
        prepare_prfft_scan.output,
        Y=prepare_prfft_scan.Y,
        re_X_0=prepare_prfft_scan.re_X_0)
    plan.computation_call(scanner, output, Y, re_X_0)

    return plan
def _build_plan(self, plan_factory, device_params, output, input_):
    """Fill a new plan with a weighted sum, a weighted RFFT and a cumulative scan.

    Calls are queued in this order: ``Reduce`` over the last axis (input
    routed through ``get_multiply`` with the cosine coefficients), ``RFFT``
    with ``dont_store_last=True`` (input routed through ``get_multiply`` with
    the sine coefficients), and a non-exclusive ``Scan`` over the last axis
    (input routed through ``get_prepare_prfft_scan``) that stores into
    ``output``.

    :param plan_factory: callable returning the empty plan.
    :param device_params: unused by this method.
    :param output: destination of the final scan.
    :param input_: source array; the coefficient tables have as many entries
        as its last axis.
    :returns: the plan with all three calls attached.
    """
    plan = plan_factory()

    N = input_.shape[-1] * 2
    # These two values are not consumed later in the method.
    batch_shape = input_.shape[:-1]
    batch_size = helpers.product(batch_shape)

    phase = 2 * numpy.pi * numpy.arange(N // 2) / N
    coeffs1 = 4 * numpy.sin(phase)
    coeffs2 = 2 * numpy.cos(phase)
    coeffs1_dev = plan.persistent_array(coeffs1)
    coeffs2_dev = plan.persistent_array(coeffs2)

    multiply = get_multiply(input_)

    # re_X_1 = sum(x * coeffs2)
    sum_template = plan.temp_array_like(input_)
    summation = Reduce(
        sum_template,
        predicate_sum(input_.dtype),
        axes=(len(input_.shape) - 1,))
    summation.parameter.input.connect(
        multiply, multiply.output, x=multiply.a, c2=multiply.b)
    re_X_0 = plan.temp_array_like(summation.parameter.output)
    plan.computation_call(summation, re_X_0, input_, coeffs2_dev)

    # Y = numpy.fft.rfft(x * coeffs1)
    forward = RFFT(input_, dont_store_last=True)
    forward.parameter.input.connect(
        multiply, multiply.output, x=multiply.a, c1=multiply.b)
    Y = plan.temp_array_like(forward.parameter.output)
    plan.computation_call(forward, Y, input_, coeffs1_dev)

    # Y *= -1j
    # Y[0] /= 2
    # Y[0] += re_X_1
    # res = numpy.cumsum(Y[:-1])
    prepare_prfft_scan = get_prepare_prfft_scan(Y)
    cumulative = Scan(Y, predicate_sum(Y.dtype), axes=(-1,), exclusive=False)
    cumulative.parameter.input.connect(
        prepare_prfft_scan,
        prepare_prfft_scan.output,
        Y=prepare_prfft_scan.Y,
        re_X_0=prepare_prfft_scan.re_X_0)
    plan.computation_call(cumulative, output, Y, re_X_0)

    return plan
def test_summation(thr):
    """Time a full sum reduction and check it against ``ndarray.sum``.

    The compiled ``Reduce`` is run 10 times with a thread synchronization
    after each run; the device result is then compared with the host sum.

    :param thr: thread object used for transfers, compilation and syncing.
    :returns: ``(fastest wall-clock time, perf_size * dtype.itemsize)``.
    """
    perf_size = 2 ** 22
    dtype = dtypes.normalize_type(numpy.int64)

    host_data = get_test_array(perf_size, dtype)
    dev_data = thr.to_device(host_data)

    reduction = Reduce(host_data, predicate_sum(dtype))
    dev_result = thr.empty_like(reduction.parameter.output)
    expected = numpy.array([host_data.sum()], dtype)

    compiled = reduction.compile(thr)

    attempts = 10
    timings = []
    for _ in range(attempts):
        started = time.time()
        compiled(dev_result, dev_data)
        # Wait for the device so the measured interval covers the whole run.
        thr.synchronize()
        timings.append(time.time() - started)

    assert diff_is_negligible(dev_result.get(), expected)

    return min(timings), perf_size * dtype.itemsize
def test_summation(thr):
    """Benchmark ``Reduce`` with a sum predicate over a ``perf_size`` test array.

    Runs the compiled reduction 10 times (synchronizing the thread each
    time), verifies the device result against the host-side sum, and reports
    the best timing.

    :param thr: thread object providing ``to_device``, ``empty_like`` and
        ``synchronize``.
    :returns: tuple of the minimum elapsed time and
        ``perf_size * dtype.itemsize``.
    """
    perf_size = 2 ** 22
    dtype = dtypes.normalize_type(numpy.int64)

    source = get_test_array(perf_size, dtype)
    source_dev = thr.to_device(source)

    summation = Reduce(source, predicate_sum(dtype))
    result_dev = thr.empty_like(summation.parameter.output)
    reference = numpy.array([source.sum()], dtype)

    run = summation.compile(thr)

    attempts = 10
    elapsed = []
    for _attempt in range(attempts):
        begin = time.time()
        run(result_dev, source_dev)
        thr.synchronize()
        elapsed.append(time.time() - begin)

    assert diff_is_negligible(result_dev.get(), reference)

    bytes_processed = perf_size * dtype.itemsize
    return min(elapsed), bytes_processed
def _build_plan(self, plan_factory, device_params, output, alpha, beta):
    """Build a plan that reduces a per-element transformation over axis 0.

    The transformation multiplies matching ``alpha`` and ``beta`` elements
    (``functions.mul``), takes the ``.x`` component of the product and
    subtracts the value of ``ordering(self._representation)``. Its float64
    output, shaped like ``alpha``, is attached to the input of a ``Reduce``
    with a sum predicate over axis 0.

    :param plan_factory: callable returning an empty plan.
    :param device_params: not read by this method.
    :param output: array receiving the reduction result.
    :param alpha: first input array; defines the transformation output shape.
    :param beta: second input array.
    :returns: the plan containing the single reduction call.
    """
    plan = plan_factory()

    for_reduction = Type(numpy.float64, alpha.shape)

    trf_parameters = [
        Parameter('output', Annotation(for_reduction, 'o')),
        Parameter('alpha', Annotation(alpha, 'i')),
        Parameter('beta', Annotation(beta, 'i')),
    ]
    trf_code = (
        " ${alpha.ctype} alpha = ${alpha.load_same};"
        " ${beta.ctype} beta = ${beta.load_same};"
        " ${alpha.ctype} t = ${mul_cc}(alpha, beta);"
        " ${output.store_same}(t.x - ${ordering}); "
    )
    trf_kwds = {
        'mul_cc': functions.mul(alpha.dtype, alpha.dtype),
        'ordering': ordering(self._representation),
    }
    meter_trf = Transformation(trf_parameters, trf_code, render_kwds=trf_kwds)

    reduction = Reduce(for_reduction, predicate_sum(output.dtype), axes=(0,))
    # The reduction's input is replaced by the transformation's two inputs.
    reduction.parameter.input.connect(
        meter_trf, meter_trf.output, alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

    plan.computation_call(reduction, output, alpha, beta)
    return plan
def _build_plan(self, plan_factory, device_params, output, alpha, beta):
    """Build a plan that reduces ``1 - exp(-alpha * beta)`` (``.x`` part) over axis 0.

    Per element, the transformation forms ``t = alpha * beta``, evaluates
    ``np = exp(-t)`` with ``functions.exp``, builds ``cp = (1 - np.x, -np.y)``
    and stores ``cp.x``. Its float64 output, shaped like ``alpha``, feeds a
    ``Reduce`` with a sum predicate over axis 0.

    :param plan_factory: callable returning an empty plan.
    :param device_params: not read by this method.
    :param output: array receiving the reduction result.
    :param alpha: first input array; defines the transformation output shape.
    :param beta: second input array.
    :returns: the plan containing the single reduction call.
    """
    plan = plan_factory()

    for_reduction = Type(numpy.float64, alpha.shape)

    trf_parameters = [
        Parameter('output', Annotation(for_reduction, 'o')),
        Parameter('alpha', Annotation(alpha, 'i')),
        Parameter('beta', Annotation(beta, 'i')),
    ]
    trf_code = (
        " ${alpha.ctype} alpha = ${alpha.load_same};"
        " ${beta.ctype} beta = ${beta.load_same};"
        " ${alpha.ctype} t = ${mul_cc}(alpha, beta);"
        " ${alpha.ctype} np = ${exp_c}(COMPLEX_CTR(${alpha.ctype})(-t.x, -t.y));"
        " ${alpha.ctype} cp = COMPLEX_CTR(${alpha.ctype})(1 - np.x, -np.y);"
        " ${output.store_same}(cp.x); "
    )
    trf_kwds = {
        'mul_cc': functions.mul(alpha.dtype, alpha.dtype),
        'exp_c': functions.exp(alpha.dtype),
    }
    meter_trf = Transformation(trf_parameters, trf_code, render_kwds=trf_kwds)

    reduction = Reduce(for_reduction, predicate_sum(output.dtype), axes=(0,))
    # The reduction's input is replaced by the transformation's two inputs.
    reduction.parameter.input.connect(
        meter_trf, meter_trf.output, alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

    plan.computation_call(reduction, output, alpha, beta)
    return plan
def _build_plan(self, plan_factory, device_params, output, matrix, vector):
    """Build a plan that multiplies ``matrix`` by ``vector`` and sums the last axis.

    A transformation multiplies every matrix element by the vector element
    addressed by the last index (``idxs[-1]``); it is attached to the input
    of a ``Reduce`` with a sum predicate over the last matrix axis.

    :param plan_factory: callable returning an empty plan.
    :param device_params: not read by this method.
    :param output: array receiving the reduced result.
    :param matrix: input array; also the type of the transformation output.
    :param vector: input array indexed by the last matrix index.
    :returns: the plan containing the single reduction call.
    """
    plan = plan_factory()

    last_axis = len(matrix.shape) - 1
    summation = Reduce(matrix, predicate_sum(matrix.dtype), axes=(last_axis,))

    trf_parameters = [
        Parameter('output', Annotation(matrix, 'o')),
        Parameter('matrix', Annotation(matrix, 'i')),
        Parameter('vector', Annotation(vector, 'i')),
    ]
    trf_kwds = {'mul': functions.mul(matrix.dtype, vector.dtype)}
    mul_vec = Transformation(
        trf_parameters,
        """ ${output.store_same}(${mul}(${matrix.load_same}, ${vector.load_idx}(${idxs[-1]}))); """,
        render_kwds=trf_kwds,
        connectors=['output', 'matrix'])

    summation.parameter.input.connect(
        mul_vec, mul_vec.output, matrix=mul_vec.matrix, vector=mul_vec.vector)

    plan.computation_call(summation, output, matrix, vector)
    return plan
def _build_plan(self, plan_factory, device_params, output, input_):
    """Build a plan from a real-part reduction followed by a wrapped ``IRFFT``.

    Two nested computations are queued:

    1. a ``Reduce`` (sum predicate) over the last axis; its input goes
       through a transformation that stores the ``.x`` component of each
       ``input_`` element. The result is the temporary ``x0``.
    2. an ``IRFFT`` whose input goes through ``get_prepare_iprfft_input``
       and whose output goes through ``get_prepare_iprfft_output``; it is
       called with ``output, x0, coeffs_arr, input_``.

    :param plan_factory: callable returning an empty plan.
    :param device_params: not read by this method.
    :param output: destination array.
    :param input_: source array; ``N`` is four times its last axis length.
    :returns: the filled plan.
    """
    plan = plan_factory()

    N = input_.shape[-1] * 4
    # Not read further in this method.
    batch_shape = input_.shape[:-1]
    batch_size = helpers.product(batch_shape)

    # The first element is unused
    sines = numpy.sin(2 * numpy.pi * numpy.arange(1, N // 2) / N)
    coeffs = numpy.concatenate([[0], 1 / (4 * sines)])
    coeffs_arr = plan.persistent_array(coeffs)

    prepare_iprfft_input = get_prepare_iprfft_input(input_)
    prepare_iprfft_output = get_prepare_iprfft_output(output)

    inverse = IRFFT(prepare_iprfft_input.Y)
    inverse.parameter.input.connect(
        prepare_iprfft_input, prepare_iprfft_input.Y, X=prepare_iprfft_input.X)
    inverse.parameter.output.connect(
        prepare_iprfft_output,
        prepare_iprfft_output.y,
        x=prepare_iprfft_output.x,
        x0=prepare_iprfft_output.x0,
        coeffs=prepare_iprfft_output.coeffs)

    real_type = Type(dtypes.real_for(input_.dtype), input_.shape)
    real = Transformation(
        [
            Parameter('output', Annotation(real_type, 'o')),
            Parameter('input', Annotation(input_, 'i')),
        ],
        """ ${output.store_same}((${input.load_same}).x); """,
        connectors=['output'])

    rd_t = Type(output.dtype, input_.shape)
    last_axis = len(input_.shape) - 1
    summation = Reduce(rd_t, predicate_sum(rd_t.dtype), axes=(last_axis,))
    summation.parameter.input.connect(real, real.output, X=real.input)

    x0 = plan.temp_array_like(summation.parameter.output)

    plan.computation_call(summation, x0, input_)
    plan.computation_call(inverse, output, x0, coeffs_arr, input_)

    return plan
def test_normal(thr, shape, axis):
    """Check ``Reduce`` with a sum predicate against ``ndarray.sum``.

    :param thr: thread object used for transfers and compilation.
    :param shape: shape passed to ``get_test_array``.
    :param axis: single axis to reduce, or ``None`` to pass ``axes=None``.
    """
    host_in = get_test_array(shape, numpy.int64)
    dev_in = thr.to_device(host_in)

    if axis is not None:
        axes = (axis,)
    else:
        axes = None

    reduction = Reduce(host_in, predicate_sum(numpy.int64), axes=axes)

    dev_out = thr.empty_like(reduction.parameter.output)
    expected = host_in.sum(axis)

    compiled = reduction.compile(thr)
    compiled(dev_out, dev_in)

    assert diff_is_negligible(dev_out.get(), expected)
def test_nonsequential_axes(thr):
    """Check ``Reduce`` over axes 0 and 2 of a 4-dimensional array.

    :param thr: thread object used for transfers and compilation.
    """
    shape = (50, 40, 30, 20)
    host_in = get_test_array(shape, numpy.int64)
    dev_in = thr.to_device(host_in)

    # Summing axis 0 first shifts the original axis 2 to position 1.
    expected = host_in.sum(0).sum(1)

    reduction = Reduce(dev_in, predicate_sum(numpy.int64), axes=(0, 2))
    dev_out = thr.empty_like(reduction.parameter.output)

    compiled = reduction.compile(thr)
    compiled(dev_out, dev_in)

    assert diff_is_negligible(dev_out.get(), expected)
def test_nonsequential_axes(thr):
    """Reduce a ``(50, 40, 30, 20)`` array over the non-adjacent axes 0 and 2.

    The device result is compared with two chained host-side sums.

    :param thr: thread object used for transfers and compilation.
    """
    shape = (50, 40, 30, 20)
    source = get_test_array(shape, numpy.int64)
    source_dev = thr.to_device(source)

    # sum over axes 0 and 2 of the initial array
    over_first = source.sum(0)
    reference = over_first.sum(1)

    summation = Reduce(source_dev, predicate_sum(numpy.int64), axes=(0, 2))
    result_dev = thr.empty_like(summation.parameter.output)

    run = summation.compile(thr)
    run(result_dev, source_dev)

    assert diff_is_negligible(result_dev.get(), reference)
def test_normal(thr, shape, axis):
    """Compare a compiled sum ``Reduce`` with the host-side ``sum(axis)``.

    :param thr: thread object used for transfers and compilation.
    :param shape: shape of the generated test array.
    :param axis: axis to reduce; ``None`` means ``axes=None`` is passed on.
    """
    source = get_test_array(shape, numpy.int64)
    source_dev = thr.to_device(source)

    reduce_axes = None if axis is None else (axis,)
    summation = Reduce(source, predicate_sum(numpy.int64), axes=reduce_axes)

    result_dev = thr.empty_like(summation.parameter.output)
    reference = source.sum(axis)

    run = summation.compile(thr)
    run(result_dev, source_dev)

    assert diff_is_negligible(result_dev.get(), reference)
def check_scan(thr, shape, axes, exclusive=False, measure_time=False,
               dtype=numpy.int64, max_work_group_size=None, predicate=None,
               seq_size=None):
    """Run a compiled ``Scan`` and compare it with ``ref_scan``.

    Note: the comparison will only work if the custom predicate is
    functionally equivalent to ``predicate_sum``.

    :param thr: thread object used for transfers, compilation and syncing.
    :param shape: shape passed to ``get_test_array``.
    :param axes: axes forwarded to ``Scan`` and ``ref_scan``.
    :param exclusive: forwarded to ``Scan`` and ``ref_scan``.
    :param measure_time: when true, run 10 timed attempts instead of one run.
    :param dtype: element type of the test array.
    :param max_work_group_size: forwarded to ``Scan``.
    :param predicate: predicate to scan with; defaults to
        ``predicate_sum(dtype)``.
    :param seq_size: forwarded to ``Scan``.
    :returns: the fastest timed attempt, or ``None`` when not measuring.
    """
    if predicate is None:
        predicate = predicate_sum(dtype)

    host_in = get_test_array(shape, dtype)

    scan = Scan(
        host_in,
        predicate,
        axes=axes,
        exclusive=exclusive,
        max_work_group_size=max_work_group_size,
        seq_size=seq_size,
    ).compile(thr)

    dev_in = thr.to_device(host_in)
    # The result buffer starts out filled with -1 rather than uninitialized.
    dev_out = thr.to_device(numpy.ones_like(host_in) * (-1))

    min_time = None
    if measure_time:
        attempts = 10
        durations = []
        for _ in range(attempts):
            tick = time.time()
            scan(dev_out, dev_in)
            thr.synchronize()
            durations.append(time.time() - tick)
        min_time = min(durations)
    else:
        scan(dev_out, dev_in)

    res_test = dev_out.get()
    res_ref = ref_scan(host_in, axes=axes, exclusive=exclusive)

    assert diff_is_negligible(res_ref, res_test)

    return min_time
def __init__(self, arr_t, order=2, axes=None):
    """Set up a ``Reduce`` wrapped by two ``norm_const`` transformations.

    ``norm_const(arr_t, order)`` is attached to the reduction's input and
    ``norm_const(res_t, 1. / order)`` to its output, where ``res_t`` is the
    reduction's own output parameter. The wrapped reduction is kept in
    ``self._rd`` and the computation is declared with an ``output`` of type
    ``res_t`` and an ``input`` of type ``arr_t``.

    :param arr_t: array-like describing the input.
    :param order: value passed to the element-wise ``norm_const``; its
        reciprocal is passed to the one applied after the sum.
    :param axes: axes forwarded to ``Reduce``.
    """
    elementwise = norm_const(arr_t, order)
    out_dtype = elementwise.output.dtype

    reduction = Reduce(
        Type(out_dtype, arr_t.shape), predicate_sum(out_dtype), axes=axes)
    res_t = reduction.parameter.output

    after_sum = norm_const(res_t, 1. / order)

    reduction.parameter.input.connect(
        elementwise, elementwise.output, input_prime=elementwise.input)
    reduction.parameter.output.connect(
        after_sum, after_sum.input, output_prime=after_sum.output)

    self._rd = reduction

    signature = [
        Parameter('output', Annotation(res_t, 'o')),
        Parameter('input', Annotation(arr_t, 'i')),
    ]
    Computation.__init__(self, signature)
def _build_plan(self, plan_factory, device_params, output, input_):
    """Fill a plan with a real-part sum and an ``IRFFT`` with attached transformations.

    First a ``Reduce`` (sum predicate, last axis) is called on ``input_``
    through a transformation that keeps only the ``.x`` component, producing
    the temporary ``x0``. Then an ``IRFFT`` is called with
    ``output, x0, coeffs_arr, input_``; its input is routed through
    ``get_prepare_iprfft_input`` and its output through
    ``get_prepare_iprfft_output``.

    :param plan_factory: callable returning the empty plan.
    :param device_params: unused by this method.
    :param output: destination array.
    :param input_: source array; the coefficient table is sized from four
        times its last axis length.
    :returns: the plan with both calls attached.
    """
    plan = plan_factory()

    N = input_.shape[-1] * 4
    # These two values are not consumed later in the method.
    batch_shape = input_.shape[:-1]
    batch_size = helpers.product(batch_shape)

    # The first element is unused
    inverse_sines = 1 / (4 * numpy.sin(2 * numpy.pi * numpy.arange(1, N // 2) / N))
    coeffs = numpy.concatenate([[0], inverse_sines])
    coeffs_arr = plan.persistent_array(coeffs)

    prepare_iprfft_input = get_prepare_iprfft_input(input_)
    prepare_iprfft_output = get_prepare_iprfft_output(output)

    irfft = IRFFT(prepare_iprfft_input.Y)
    irfft.parameter.input.connect(
        prepare_iprfft_input,
        prepare_iprfft_input.Y,
        X=prepare_iprfft_input.X)
    irfft.parameter.output.connect(
        prepare_iprfft_output,
        prepare_iprfft_output.y,
        x=prepare_iprfft_output.x,
        x0=prepare_iprfft_output.x0,
        coeffs=prepare_iprfft_output.coeffs)

    real_parameters = [
        Parameter(
            'output',
            Annotation(Type(dtypes.real_for(input_.dtype), input_.shape), 'o')),
        Parameter('input', Annotation(input_, 'i')),
    ]
    real = Transformation(
        real_parameters,
        """ ${output.store_same}((${input.load_same}).x); """,
        connectors=['output'])

    rd_t = Type(output.dtype, input_.shape)
    reducer = Reduce(rd_t, predicate_sum(rd_t.dtype), axes=(len(input_.shape) - 1,))
    reducer.parameter.input.connect(real, real.output, X=real.input)

    x0 = plan.temp_array_like(reducer.parameter.output)

    plan.computation_call(reducer, x0, input_)
    plan.computation_call(irfft, output, x0, coeffs_arr, input_)

    return plan
def _build_plan(self, plan_factory, device_params, output, alpha, beta):
    """Build a plan that reduces per-sample mode products over axis 0.

    The transformation output is float64 with shape
    ``(alpha.shape[0], self._max_click_order)``. For every
    ``(sample_idx, order - 1)`` pair the kernel loops over
    ``self._system.modes`` modes, forms ``np = exp(-alpha * beta)`` for each
    and multiplies the running result by ``1 - np`` for modes below ``order``
    and by ``np`` otherwise; the ``.x`` component of the product is stored.
    A ``Reduce`` with a sum predicate then collapses axis 0.

    :param plan_factory: callable returning an empty plan.
    :param device_params: not read by this method.
    :param output: array receiving the reduction result.
    :param alpha: first input array, indexed as ``(sample, mode)``.
    :param beta: second input array, indexed as ``(sample, mode)``.
    :returns: the plan containing the single reduction call.
    """
    plan = plan_factory()

    for_reduction = Type(numpy.float64, (alpha.shape[0], self._max_click_order))

    trf_parameters = [
        Parameter('output', Annotation(for_reduction, 'o')),
        Parameter('alpha', Annotation(alpha, 'i')),
        Parameter('beta', Annotation(beta, 'i')),
    ]
    trf_code = (
        " VSIZE_T sample_idx = ${idxs[0]};"
        " VSIZE_T order = ${idxs[1]} + 1;"
        " ${alpha.ctype} result = COMPLEX_CTR(${alpha.ctype})(1, 0);"
        " for (VSIZE_T i = 0; i < ${modes}; i++) {"
        " ${alpha.ctype} alpha = ${alpha.load_idx}(sample_idx, i);"
        " ${beta.ctype} beta = ${beta.load_idx}(sample_idx, i);"
        " ${alpha.ctype} t = ${mul_cc}(alpha, beta);"
        " ${alpha.ctype} np = ${exp_c}(COMPLEX_CTR(${alpha.ctype})(-t.x, -t.y));"
        " if (i >= order) {"
        " result = ${mul_cc}(result, np);"
        " }"
        " else {"
        " ${alpha.ctype} cp = COMPLEX_CTR(${alpha.ctype})(1 - np.x, -np.y);"
        " result = ${mul_cc}(result, cp);"
        " }"
        " }"
        " ${output.store_same}(result.x); "
    )
    trf_kwds = {
        'mul_cc': functions.mul(alpha.dtype, alpha.dtype),
        'exp_c': functions.exp(alpha.dtype),
        'modes': self._system.modes,
    }
    meter_trf = Transformation(trf_parameters, trf_code, render_kwds=trf_kwds)

    reduction = Reduce(for_reduction, predicate_sum(output.dtype), axes=(0,))
    reduction.parameter.input.connect(
        meter_trf, meter_trf.output, alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

    plan.computation_call(reduction, output, alpha, beta)
    return plan
def _build_plan(
        self, plan_factory, device_params,
        ks_a, ks_b, ks_cv, in_key, out_key, noises_a, noises_b):
    """Build the plan that fills ``ks_a``, ``ks_b`` and ``ks_cv``.

    Three nested computations are queued, in this order:

    1. a full ``Reduce`` (sum predicate) of ``noises_b`` whose output is
       divided by ``numpy.prod(noises_b.shape)`` via
       ``transformations.div_const``, giving ``noises_b_mean``;
    2. ``MatrixMulVector(noises_a)`` called with ``noises_a`` and
       ``out_key``, giving ``b_term``;
    3. a ``PureParallel`` rendered from the ``make_lwe_keyswitch_key``
       template definition, guided by ``ks_b``, writing the three outputs.

    :param plan_factory: callable returning an empty plan.
    :param device_params: not read by this method.
    :param ks_a: output array; must have exactly four dimensions.
    :param ks_b: output array; guiding array of the final kernel.
    :param ks_cv: output array.
    :param in_key: input array for the final kernel.
    :param out_key: input array multiplied with ``noises_a``.
    :param noises_a: input array.
    :param noises_b: input array whose mean is also passed to the kernel.
    :returns: the filled plan.
    """
    plan = plan_factory()

    # The sizes are not read below; the unpacking requires a 4-dimensional ks_a.
    extracted_n, t, base, inner_n = ks_a.shape

    mean = Reduce(noises_b, predicate_sum(noises_b.dtype))
    norm = transformations.div_const(
        mean.parameter.output, numpy.prod(noises_b.shape))
    # After this connection the reduction's output parameter is named "mean".
    mean.parameter.output.connect(norm, norm.input, mean=norm.output)
    noises_b_mean = plan.temp_array_like(mean.parameter.mean)

    mul_key = MatrixMulVector(noises_a)
    b_term = plan.temp_array_like(mul_key.parameter.output)

    kernel_signature = [
        ('ks_a', ks_a, 'o'),
        ('ks_b', ks_b, 'o'),
        ('ks_cv', ks_cv, 'o'),
        ('in_key', in_key, 'i'),
        ('b_term', b_term, 'i'),
        ('noises_a', noises_a, 'i'),
        ('noises_b', noises_b, 'i'),
        ('noises_b_mean', noises_b_mean, 'i'),
    ]
    kernel_parameters = [
        Parameter(name, Annotation(array_like, role))
        for name, array_like, role in kernel_signature
    ]
    kernel_snippet = Snippet(
        TEMPLATE.get_def("make_lwe_keyswitch_key"),
        render_kwds={
            'log2_base': self._log2_base,
            'output_size': self._output_size,
            'double_to_t32': double_to_t32_module,
            'noise': self._noise,
        })
    build_keyswitch = PureParallel(
        kernel_parameters, kernel_snippet, guiding_array="ks_b")

    plan.computation_call(mean, noises_b_mean, noises_b)
    plan.computation_call(mul_key, b_term, noises_a, out_key)
    plan.computation_call(
        build_keyswitch,
        ks_a, ks_b, ks_cv, in_key, b_term, noises_a, noises_b, noises_b_mean)

    return plan
def check_scan(
        thr, shape, axes, exclusive=False, measure_time=False,
        dtype=numpy.int64, max_work_group_size=None, predicate=None,
        seq_size=None):
    """Compile a ``Scan``, execute it and assert it matches ``ref_scan``.

    Note: the comparison will only work if the custom predicate is
    functionally equivalent to ``predicate_sum``.

    :param thr: thread object used for transfers, compilation and syncing.
    :param shape: shape of the generated test array.
    :param axes: scan axes, passed to both ``Scan`` and ``ref_scan``.
    :param exclusive: passed to both ``Scan`` and ``ref_scan``.
    :param measure_time: run 10 timed attempts when true, one run otherwise.
    :param dtype: element type of the test array.
    :param max_work_group_size: passed to ``Scan``.
    :param predicate: scan predicate; ``predicate_sum(dtype)`` when ``None``.
    :param seq_size: passed to ``Scan``.
    :returns: minimum measured time, or ``None`` if timing was not requested.
    """
    if predicate is None:
        predicate = predicate_sum(dtype)

    source = get_test_array(shape, dtype)

    scan_kwds = dict(
        axes=axes,
        exclusive=exclusive,
        max_work_group_size=max_work_group_size,
        seq_size=seq_size)
    scan = Scan(source, predicate, **scan_kwds).compile(thr)

    source_dev = thr.to_device(source)
    # Destination is seeded with -1 on the host and uploaded.
    result_dev = thr.to_device(numpy.ones_like(source) * (-1))

    if not measure_time:
        scan(result_dev, source_dev)
        min_time = None
    else:
        attempts = 10
        elapsed = []
        for _attempt in range(attempts):
            begin = time.time()
            scan(result_dev, source_dev)
            thr.synchronize()
            elapsed.append(time.time() - begin)
        min_time = min(elapsed)

    res_test = result_dev.get()
    res_ref = ref_scan(source, axes=axes, exclusive=exclusive)

    assert diff_is_negligible(res_ref, res_test)

    return min_time
def sample(data, t, samplers):
    """Evaluate every sampler and collect values, mean and standard error.

    Each sampler is called as ``sampler(data, t)``. If it raises
    ``StopIntegration``, the exception's first argument is used as the sample
    and the returned stop flag is set. Host arrays (``numpy.ndarray``) are
    summarized with numpy; any other sample is summarized on its own
    ``thread`` with ``Reduce`` and ``EntrywiseNorm`` along axis 0.

    :param data: passed through to every sampler.
    :param t: passed through to every sampler and stored under ``'time'``.
    :param samplers: mapping of key to sampler; each sampler exposes the
        flags ``no_values``, ``no_mean`` and ``no_stderr``.
    :returns: ``(sample_dict, stop_integration)`` where ``sample_dict[key]``
        always holds ``'trajectories'`` and ``'time'`` plus whichever of
        ``'values'``, ``'mean'``, ``'stderr'`` were not disabled.
    """
    sample_dict = {}
    stop_integration = False

    for key, sampler in samplers.items():
        try:
            values = sampler(data, t)
        except StopIntegration as exc:
            values = exc.args[0]
            stop_integration = True

        n_traj = values.shape[0]
        entry = dict(trajectories=n_traj, time=t)
        sample_dict[key] = entry

        if not isinstance(values, numpy.ndarray):
            # Device-side sample: run the reductions on its own thread.
            thr = values.thread

            if not sampler.no_values:
                entry['values'] = values.get()

            if not sampler.no_mean:
                summation = Reduce(
                    values, predicate_sum(values.dtype), axes=(0,)).compile(thr)
                sum_dev = thr.empty_like(summation.parameter.output)
                summation(sum_dev, values)
                entry['mean'] = sum_dev.get() / n_traj

            if not sampler.no_stderr:
                norm2 = EntrywiseNorm(values, order=2, axes=(0,)).compile(thr)
                n2_dev = thr.empty_like(norm2.parameter.output)
                norm2(n2_dev, values)
                std = n2_dev.get() / numpy.sqrt(n_traj)
                entry['stderr'] = std / numpy.sqrt(n_traj)

            continue

        # Host-side sample.
        if not sampler.no_values:
            entry['values'] = values.copy()
        if not sampler.no_mean:
            entry['mean'] = values.mean(0)
        if not sampler.no_stderr:
            entry['stderr'] = values.std(0) / numpy.sqrt(n_traj)

    return sample_dict, stop_integration
def _build_plan(self, plan_factory, device_params, output, alpha, beta):
    """Build a plan that reduces cumulative mode products over axis 0.

    The transformation output is float64 with shape
    ``(alpha.shape[0], self._max_moment)``. For every ``(sample_idx, order)``
    pair the kernel multiplies together, for modes ``0 .. order`` inclusive,
    the product ``alpha * beta`` with the value of
    ``ordering(self._representation)`` subtracted from its ``.x`` component,
    and stores the ``.x`` component of the result. A ``Reduce`` with a sum
    predicate then collapses axis 0.

    :param plan_factory: callable returning an empty plan.
    :param device_params: not read by this method.
    :param output: array receiving the reduction result.
    :param alpha: first input array, indexed as ``(sample, mode)``.
    :param beta: second input array, indexed as ``(sample, mode)``.
    :returns: the plan containing the single reduction call.
    """
    plan = plan_factory()

    for_reduction = Type(numpy.float64, (alpha.shape[0], self._max_moment))

    trf_parameters = [
        Parameter('output', Annotation(for_reduction, 'o')),
        Parameter('alpha', Annotation(alpha, 'i')),
        Parameter('beta', Annotation(beta, 'i')),
    ]
    trf_code = (
        " VSIZE_T sample_idx = ${idxs[0]};"
        " VSIZE_T order = ${idxs[1]};"
        " ${alpha.ctype} result = COMPLEX_CTR(${alpha.ctype})(1, 0);"
        " for (VSIZE_T i = 0; i <= order; i++) {"
        " ${alpha.ctype} alpha = ${alpha.load_idx}(sample_idx, i);"
        " ${beta.ctype} beta = ${beta.load_idx}(sample_idx, i);"
        " ${alpha.ctype} t = ${mul_cc}(alpha, beta);"
        " t.x -= ${ordering};"
        " result = ${mul_cc}(result, t);"
        " }"
        " ${output.store_same}(result.x); "
    )
    trf_kwds = {
        'mul_cc': functions.mul(alpha.dtype, alpha.dtype),
        'ordering': ordering(self._representation),
    }
    meter_trf = Transformation(trf_parameters, trf_code, render_kwds=trf_kwds)

    reduction = Reduce(for_reduction, predicate_sum(output.dtype), axes=(0,))
    reduction.parameter.input.connect(
        meter_trf, meter_trf.output, alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

    plan.computation_call(reduction, output, alpha, beta)
    return plan
def _build_plan(self, plan_factory, device_params, output, alpha, beta):
    """Build the plan: prepare kernel, aggregate kernel, reduction, FFT.

    Steps queued on the plan, in order:

    1. the ``compound_click_probability_prepare`` kernel, writing
       ``prepared_state`` from ``alpha`` and ``beta``;
    2. the ``compound_click_probability_aggregate`` kernel, writing
       ``products`` from ``prepared_state``;
    3. a ``Reduce`` (sum predicate) of ``products`` over axis 0 into ``temp``;
    4. an ``FFT`` of ``temp`` (called with ``True`` as its last argument)
       whose output goes through a transformation storing only the ``.x``
       component into ``output``.

    The aggregate kernel is attempted with progressively halved work-group
    sizes: every ``OutOfResourcesError`` halves ``read_size`` and the call is
    retried while ``read_size > 1``. If all attempts fail, the last
    ``OutOfResourcesError`` is re-raised instead of returning a plan without
    the kernel.

    :param plan_factory: callable returning an empty plan.
    :param device_params: supplies ``local_mem_size`` and
        ``max_work_group_size``.
    :param output: array receiving the final result.
    :param alpha: two-dimensional input, unpacked as ``(samples, modes)``.
    :param beta: second input array.
    :returns: the filled plan.
    :raises OutOfResourcesError: when the aggregate kernel cannot be added
        with any attempted ``read_size``.
    """
    plan = plan_factory()

    samples, modes = alpha.shape
    output_size = self._max_total_clicks + 1

    for_reduction = Type(alpha.dtype, (samples, output_size))

    prepared_state = plan.temp_array_like(alpha)

    plan.kernel_call(
        TEMPLATE.get_def("compound_click_probability_prepare"),
        [prepared_state, alpha, beta],
        kernel_name="compound_click_probability_prepare",
        global_size=alpha.shape,
        render_kwds=dict(
            mul_cc=functions.mul(alpha.dtype, alpha.dtype),
            exp_c=functions.exp(alpha.dtype),
            ))

    # Block size is limited by the amount of available local memory.
    # In some OpenCL implementations the number reported cannot actually be fully used
    # (because it's used by kernel arguments), so we're padding it a little.
    local_mem_size = device_params.local_mem_size
    max_elems = (local_mem_size - 256) // alpha.dtype.itemsize
    block_size = 2**helpers.log2(max_elems)

    # No reason to have block size larger than the number of modes
    block_size = min(block_size, helpers.bounding_power_of_2(modes))

    products_gsize = (
        samples,
        helpers.min_blocks(output_size, block_size) * block_size)
    products = plan.temp_array_like(for_reduction)

    # These depend only on block_size, so they are computed once up front.
    full_steps = modes // block_size
    remainder_size = modes % block_size

    read_size = min(block_size, device_params.max_work_group_size)
    last_error = None

    while read_size > 1:
        try:
            plan.kernel_call(
                TEMPLATE.get_def("compound_click_probability_aggregate"),
                [products, prepared_state],
                kernel_name="compound_click_probability_aggregate",
                global_size=products_gsize,
                local_size=(1, read_size,),
                render_kwds=dict(
                    block_size=block_size,
                    read_size=read_size,
                    full_steps=full_steps,
                    remainder_size=remainder_size,
                    output_size=output_size,
                    mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                    add_cc=functions.add(alpha.dtype, alpha.dtype),
                    polar_unit=functions.polar_unit(dtypes.real_for(alpha.dtype)),
                    modes=self._system.modes,
                    max_total_clicks=self._max_total_clicks,
                    ))
        except OutOfResourcesError as exc:
            # Too large for the device: retry with half the work-group size.
            last_error = exc
            read_size //= 2
        else:
            last_error = None
            break

    if last_error is not None:
        # Every attempted size was rejected; do not return an incomplete plan.
        raise last_error

    reduction = Reduce(for_reduction, predicate_sum(alpha.dtype), axes=(0,))

    temp = plan.temp_array_like(reduction.parameter.output)

    plan.computation_call(reduction, temp, products)

    fft = FFT(temp)
    real_trf = Transformation(
        [
            Parameter('output', Annotation(output, 'o')),
            Parameter('input', Annotation(temp, 'i')),
        ],
        """ ${input.ctype} val = ${input.load_same}; ${output.store_same}(val.x); """)
    fft.parameter.output.connect(real_trf, real_trf.input, output_p=real_trf.output)

    plan.computation_call(fft, output, temp, True)

    return plan
def solve(nufft, gy, solver=None, maxiter=30, *args, **kwargs):
    """
    The solve function of NUFFT_hsa.

    Supported solvers: 'cg' (conjugate gradient, implemented here),
    'L1TVOLS' and 'L1TVLAD' (both delegated to the functions of the same
    name). 'dc' (density compensation) is not available in this version.

    :param nufft: NUFFT_hsa object
    :param gy: (M,) or (M, batch) array, non-uniform data.
        If batch is provided, 'cg' and 'L1TVOLS' returns different image shape.
    :param solver: name of the solver; any other value (including None)
        makes the function return None without doing any work.
    :param maxiter: number of 'cg' iterations; also forwarded to the
        delegated solvers.
    :type gy: numpy.complex64 reikna array
    :return: x: Nd or Nd + (batch, ) image.
        L1TVOLS always returns Nd. 'cg' returns Nd + (batch, ) in batch mode.
    :rtype: x: reikna array, complex64.
    :raises NotImplementedError: for solver == 'dc'.
    """
    if 'L1TVLAD' == solver:
        return L1TVLAD(nufft, gy, maxiter=maxiter, *args, **kwargs)

    if 'L1TVOLS' == solver:
        return L1TVOLS(nufft, gy, maxiter=maxiter, *args, **kwargs)

    if 'dc' == solver:
        # Density compensation method: its computation is disabled here, so
        # there is no result to return. Fail explicitly instead of hitting
        # an unbound local variable.
        print(solver, ":density compensation method. I won't recommend it as the GPU version is not needed! Try the CPU version")
        raise NotImplementedError(
            "the 'dc' solver is not implemented for the GPU version")

    if 'cg' != solver:
        # Unrecognized solver names are ignored.
        return None

    from reikna.algorithms import Reduce, predicate_sum

    # Reduction kernel on the device; the predicate uses the same dtype as
    # the array the reduction is built for.
    nufft.reduce_sum = Reduce(
        numpy.zeros(nufft.Kd, dtype=nufft.dtype),
        predicate_sum(nufft.dtype)).compile(nufft.thr)

    gsize = int(nufft.batch * nufft.Kdprod)

    # update: b = spH * gy
    b = nufft._y2k_device(gy)

    # Initialize x = b
    x = nufft.thr.copy_array(b)

    # initialize r = b - A * x
    r = nufft.thr.empty_like(b)
    y_tmp = nufft._k2y_device(x)
    Ax = nufft._y2k_device(y_tmp)
    del y_tmp
    nufft.prg.cAddVec(b, -Ax, r, local_size=None, global_size=gsize)

    # p = r
    p = nufft.thr.copy_array(r)

    # rsold = r' * r
    tmp_array = nufft.thr.empty_like(r)
    nufft.prg.cMultiplyConjVec(
        r, r, tmp_array, local_size=None, global_size=gsize)
    rsold = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
    nufft.reduce_sum(rsold, tmp_array)

    rsnew = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
    tmp_sum = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)

    for pp in range(0, maxiter):
        tmp_p = nufft._k2y_device(p)
        Ap = nufft._y2k_device(tmp_p)
        del tmp_p

        # alpha = rs_old/(p'*Ap)
        nufft.prg.cMultiplyConjVec(
            p, Ap, tmp_array, local_size=None, global_size=gsize)
        nufft.reduce_sum(tmp_sum, tmp_array)
        alpha = rsold / tmp_sum
        # Fetched once; the same host value scales both p and Ap below.
        alpha_host = alpha.get()

        # x = x + alpha*p
        p2 = nufft.thr.copy_array(p)
        nufft.prg.cMultiplyScalar(
            alpha_host, p2, local_size=None, global_size=gsize)
        x += p2

        # r = r - alpha * Ap
        p2 = nufft.thr.copy_array(Ap)
        nufft.prg.cMultiplyScalar(
            alpha_host, p2, local_size=None, global_size=gsize)
        r -= p2

        # rs_new = r'*r
        nufft.prg.cMultiplyConjVec(
            r, r, tmp_array, local_size=None, global_size=gsize)
        nufft.reduce_sum(rsnew, tmp_array)

        # p = r + (rs_new/rs_old)*p
        beta = rsnew / rsold
        p2 = nufft.thr.copy_array(p)
        nufft.prg.cMultiplyScalar(
            beta.get(), p2, local_size=None, global_size=gsize)
        nufft.prg.cAddVec(r, p2, p, local_size=None, global_size=gsize)
        p = r + p2

        rsold = nufft.thr.copy_array(rsnew)
    # end of iteration

    # inverse FFT: k_Kd2 -> x_Nd
    x2 = nufft._k2xx_device(x)  # x is the solved k space

    # rescale
    try:
        x2 /= nufft.volume['SnGPUArray']
    except Exception:
        # Fall back to the tensor-based division kernel when the direct
        # division is not possible. Only ordinary errors are caught, so
        # KeyboardInterrupt / SystemExit still propagate.
        nufft.prg.cTensorMultiply(
            numpy.uint32(nufft.batch),
            numpy.uint32(nufft.tSN['Tdims']),
            nufft.tSN['Td'],
            nufft.tSN['Td_elements'],
            nufft.tSN['invTd_elements'],
            nufft.tSN['tensor_sn'],
            x2,
            numpy.uint32(1),  # division, 1 is true
            local_size=None,
            global_size=int(nufft.batch * nufft.Ndprod))

    return x2