def _build_plan(self, plan_factory, device_params, alpha, beta, seed): plan = plan_factory() bijection = philox(64, 2) # Keeping the kernel the same so it can be cached. # The seed will be passed as the computation parameter instead. keygen = KeyGenerator.create(bijection, seed=numpy.int32(0)) sampler = normal_bm(bijection, numpy.float64) squeezing = plan.persistent_array(self._system.squeezing) decoherence = plan.persistent_array(self._system.decoherence) plan.kernel_call(TEMPLATE.get_def("generate_input_state"), [alpha, beta, squeezing, decoherence, seed], kernel_name="generate", global_size=alpha.shape, render_kwds=dict( system=self._system, representation=self._representation, Representation=Representation, bijection=bijection, keygen=keygen, sampler=sampler, ordering=ordering, exp=functions.exp(numpy.float64), mul_cr=functions.mul(numpy.complex128, numpy.float64), add_cc=functions.add(numpy.complex128, numpy.complex128), )) return plan
def _build_plan(self, plan_factory, device_params, output, alpha, beta): plan = plan_factory() for_reduction = Type(numpy.float64, alpha.shape) meter_trf = Transformation([ Parameter('output', Annotation(for_reduction, 'o')), Parameter('alpha', Annotation(alpha, 'i')), Parameter('beta', Annotation(beta, 'i')), ], """ ${alpha.ctype} alpha = ${alpha.load_same}; ${beta.ctype} beta = ${beta.load_same}; ${alpha.ctype} t = ${mul_cc}(alpha, beta); ${alpha.ctype} np = ${exp_c}(COMPLEX_CTR(${alpha.ctype})(-t.x, -t.y)); ${alpha.ctype} cp = COMPLEX_CTR(${alpha.ctype})(1 - np.x, -np.y); ${output.store_same}(cp.x); """, render_kwds=dict( mul_cc=functions.mul(alpha.dtype, alpha.dtype), exp_c=functions.exp(alpha.dtype), )) reduction = Reduce(for_reduction, predicate_sum(output.dtype), axes=(0,)) reduction.parameter.input.connect( meter_trf, meter_trf.output, alpha_p=meter_trf.alpha, beta_p=meter_trf.beta) plan.computation_call(reduction, output, alpha, beta) return plan
def kspacegaussian_filter_CL(ksp, sigma): from reikna import cluda from reikna.cluda import functions, dtypes sz = np.array(ksp.shape) dtype = np.complex64 ftype = np.float32 api = cluda.ocl_api() thr = api.Thread.create() FACTOR = 1.0 program = thr.compile(""" KERNEL void gauss_kernel( GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *src) { const ${ultype} x = (${ultype}) get_global_id(0); const SIZE_T dim1= %d; const SIZE_T dim2= %d; const SIZE_T dim3= %d; ${ftype} sigma[3]; sigma[0]=%f;sigma[1]=%f;sigma[2]=%f; ${ftype} factor = %f; const double TWOPISQ = 19.739208802178716; //6.283185307179586; //2*3.141592; const ${ftype} SQRT2PI = 2.5066282746; const double CUBEDSQRT2PI = 15.749609945722419; const ${ultype} idx = x; ${ftype} i = (${ftype})((x / dim3) / dim2); i = (i - (${ftype})floor((${ftype})(dim1)/2.0))/(${ftype})(dim1); ${ftype} j = (${ftype})(x / dim3); if((SIZE_T)j > dim2) {j=(${ftype})fmod(j, (${ftype})dim2);}; j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2); //Account for large global index (stored as ulong) before performing modulus double pre_k=fmod((double)(x) , (double) dim3); ${ftype} k = (${ftype}) pre_k; k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3); ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); //${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1; //${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); dest[idx].x = src[idx].x * weight; dest[idx].y = src[idx].y * weight; } """ % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR), render_kwds=dict(ctype=dtypes.ctype(dtype), ftype=dtypes.ctype(ftype), ultype=dtypes.ctype(np.uint64), exp=functions.exp(ftype)), fast_math=True) gauss_kernel = program.gauss_kernel data_dev = thr.empty_like(ksp) gauss_kernel(data_dev, data_dev, global_size=sz[0] * sz[1] * sz[2]) ksp_out = data_dev.get() ifft = FFT(data_dev) cifft = ifft.compile(thr) cifft(data_dev, data_dev, inverse=0) result = np.fft.fftshift(data_dev.get() / sz[0] * sz[1] * sz[2]) result = result[::-1, ::-1, ::-1] result = np.roll(np.roll(np.roll(result, 1, axis=2), 1, axis=1), 1, axis=0) return ksp_out
def get_kprop_trf(state_arr, ksquared_arr, coeffs, exp=False): compound_dtype = dtypes.result_type(coeffs.dtype, ksquared_arr.dtype) return Transformation( [ Parameter('output', Annotation(state_arr, 'o')), Parameter('input', Annotation(state_arr, 'i')), Parameter('ksquared', Annotation(ksquared_arr, 'i')), Parameter('dt', Annotation(ksquared_arr.dtype))], """ %if max(coeffs.values) > 0: ${ksquared.ctype} ksquared = ${ksquared.load_idx}(${', '.join(idxs[2:])}); %endif ${dtypes.ctype(compound_dtype)} compound_coeff = ${dtypes.c_constant(0, compound_dtype)}; %for pwr, values in coeffs.values.items(): { ${dtypes.ctype(coeffs.dtype)} value; %for comp in range(output.shape[1]): ${'if' if comp == 0 else 'else if'} (${idxs[1]} == ${comp}) { value = ${dtypes.c_constant(values[comp], coeffs.dtype)}; } %endfor compound_coeff = compound_coeff + ${mul_kc}( %if pwr == 0: ${dt} %elif pwr == 2: -ksquared * ${dt} %else: pow(-ksquared, ${pwr // 2}) * ${dt} %endif , value ); } %endfor ${output.store_same}(${mul_ic}( ${input.load_same}, %if exp is not None: ${exp}(compound_coeff) %else: compound_coeff %endif )); """, render_kwds=dict( coeffs=coeffs, compound_dtype=compound_dtype, mul_ic=functions.mul(state_arr.dtype, compound_dtype, out_dtype=state_arr.dtype), mul_kc=functions.mul(ksquared_arr.dtype, coeffs.dtype, out_dtype=compound_dtype), exp=functions.exp(compound_dtype) if exp else None))
def _build_plan(self, plan_factory, device_params, output, alpha, beta): plan = plan_factory() for_reduction = Type(numpy.float64, (alpha.shape[0], self._max_click_order)) meter_trf = Transformation([ Parameter('output', Annotation(for_reduction, 'o')), Parameter('alpha', Annotation(alpha, 'i')), Parameter('beta', Annotation(beta, 'i')), ], """ VSIZE_T sample_idx = ${idxs[0]}; VSIZE_T order = ${idxs[1]} + 1; ${alpha.ctype} result = COMPLEX_CTR(${alpha.ctype})(1, 0); for (VSIZE_T i = 0; i < ${modes}; i++) { ${alpha.ctype} alpha = ${alpha.load_idx}(sample_idx, i); ${beta.ctype} beta = ${beta.load_idx}(sample_idx, i); ${alpha.ctype} t = ${mul_cc}(alpha, beta); ${alpha.ctype} np = ${exp_c}(COMPLEX_CTR(${alpha.ctype})(-t.x, -t.y)); if (i >= order) { result = ${mul_cc}(result, np); } else { ${alpha.ctype} cp = COMPLEX_CTR(${alpha.ctype})(1 - np.x, -np.y); result = ${mul_cc}(result, cp); } } ${output.store_same}(result.x); """, render_kwds=dict( mul_cc=functions.mul(alpha.dtype, alpha.dtype), exp_c=functions.exp(alpha.dtype), modes=self._system.modes, )) reduction = Reduce(for_reduction, predicate_sum(output.dtype), axes=(0,)) reduction.parameter.input.connect( meter_trf, meter_trf.output, alpha_p=meter_trf.alpha, beta_p=meter_trf.beta) plan.computation_call(reduction, output, alpha, beta) return plan
def gaussian_fourierkernel(siz, sigma_): """ Create Gaussian Fourier filter kernel with GPU """ if not hasattr(sigma, "__len__"): # type(sigma) is float: sigma = np.ones(3) * sigma_ elif len(sigma) == 2: sigma[2] = 0.0 sz = siz ctype = np.complex64 ftype = np.float32 #api = cluda.ocl_api() api = any_api() thr = api.Thread.create() base = np.ones(siz, ctype) data_dev = thr.to_device(base) FACTOR = 1.0 program = thr.compile(""" KERNEL void gauss_kernel( GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *src) { const ulong x = get_global_id(0); const SIZE_T dim1= %d; const SIZE_T dim2= %d; const SIZE_T dim3= %d; ${ftype} sigma[3]; sigma[0]=%f;sigma[1]=%f;sigma[2]=%f; ${ftype} factor = %f; const double TWOPISQ = 19.739208802178716; //6.283185307179586; //2*3.141592; const ${ftype} SQRT2PI = 2.5066282746; const double CUBEDSQRT2PI = 15.749609945722419; const ulong idx = x; ${ftype} i = (${ftype})((x / dim3) / dim2); i = (i - (${ftype})floor((${ftype})(dim1)/2.0))/(${ftype})(dim1); ${ftype} j = (${ftype})(x / dim3); if((SIZE_T)j > dim2) {j=(${ftype})fmod(j, (${ftype})dim2);}; j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2); //Account for large global index (stored as ulong) before performing modulus double pre_k=fmod((double)(x) , (double) dim3); ${ftype} k = (${ftype}) pre_k; k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3); ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); //${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1; //${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); dest[idx].x = src[idx].x * weight; dest[idx].y = src[idx].y * weight; } """ % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR), render_kwds=dict(ctype=dtypes.ctype(ctype), ftype=dtypes.ctype(ftype), exp=functions.exp(ftype)), fast_math=True) gauss_kernel = program.gauss_kernel #data_dev = thr.empty_like(ksp_dev) gauss_kernel(data_dev, data_dev, global_size=sz[0] * sz[1] * sz[2]) gfilter = data_dev.get() thr.synchronize() thr.release() return gfilter
def kspacegaussian_filter_CL2(ksp, sigma): """ Kspace gaussian filter and recon using GPU OpenCL 1. GPU intialisation 2. push KSP complex matrix to GPU 3. declare FFT program 4. declare Complex Gaussian GPU filter program 5. Execute Gaussian GPU program 6. GPU sync 7. Execute FFT Recon 8. Execute FFTshift 9. Retrieve reconstruced complex image from GPU 10. Reorganise image to standard (mimic numpy format) """ sz = ksp.shape dtype = np.complex64 ftype = np.float32 ultype = np.uint64 #api = cluda.ocl_api() api = any_api() thr = api.Thread.create() data_dev = thr.to_device(ksp) ifft = FFT(data_dev) FACTOR = 1.0 program = thr.compile(""" KERNEL void gauss_kernel( GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *src) { const ulong x = get_global_id(0); const SIZE_T dim1= %d; const SIZE_T dim2= %d; const SIZE_T dim3= %d; ${ftype} sigma[3]; sigma[0]=%f;sigma[1]=%f;sigma[2]=%f; ${ftype} factor = %f; const double TWOPISQ = 19.739208802178716; //6.283185307179586; //2*3.141592; const ${ftype} SQRT2PI = 2.5066282746; const double CUBEDSQRT2PI = 15.749609945722419; const ulong idx = x; ${ftype} i = (${ftype})((x / dim3) / dim2); i = (i - (${ftype})floor((${ftype})(dim1)/2.0f))/(${ftype})(dim1); ${ftype} j = (${ftype})(x / dim3); if((SIZE_T)j > dim2) {j=(${ftype})fmod(j, (${ftype})dim2);}; j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2); // Account for large global index (stored as ulong) before performing modulus double pre_k=fmod((double)(x), (double)dim3); ${ftype} k = (${ftype}) pre_k; k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3); ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); // ${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1; // ${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); dest[idx].x = src[idx].x * weight; dest[idx].y = src[idx].y * weight; } """ % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR), render_kwds=dict(ctype=dtypes.ctype(dtype), ftype=dtypes.ctype(ftype), exp=functions.exp(ftype)), fast_math=True) gauss_kernel = program.gauss_kernel gauss_kernel(data_dev, data_dev, global_size=sz[0] * sz[1] * sz[2]) thr.synchronize() # Recon #data_dev = thr.to_device(ksp) ifftobj = FFT(data_dev) cifft = ifftobj.compile(thr) fftshiftobj = FFTShift(data_dev) cfftshift = fftshiftobj.compile(thr) cifft(data_dev, data_dev, inverse=0) thr.synchronize() cfftshift(data_dev, data_dev) thr.synchronize() result2 = data_dev.get() / np.prod(np.array(ksp.shape)) result2 = result2[::-1, ::-1, ::-1] thr.release() return result2
def test_exp(thr, out_code, in_codes): out_dtype, in_dtypes = generate_dtypes(out_code, in_codes) check_func(thr, functions.exp(in_dtypes[0]), numpy.exp, out_dtype, in_dtypes)
def kspacegaussian_filter_CL2(ksp, sigma): sz = ksp.shape dtype = np.complex64 ftype = np.float32 #api = cluda.ocl_api() api = cuda_api() thr = api.Thread.create() data_dev = thr.to_device(ksp) ifft = FFT(data_dev) FACTOR = 1.0 program = thr.compile(""" KERNEL void gauss_kernel( GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *src) { const ulong x = get_global_id(0); const SIZE_T dim1= %d; const SIZE_T dim2= %d; const SIZE_T dim3= %d; ${ftype} sigma[3]; sigma[0]=%f;sigma[1]=%f;sigma[2]=%f; ${ftype} factor = %f; const double TWOPISQ = 19.739208802178716; //6.283185307179586; //2*3.141592; // const ${ftype} SQRT2PI = 2.5066282746; // const double CUBEDSQRT2PI = 15.749609945722419; const ulong idx = x; ${ftype} i = (${ftype})((x / dim3) / dim2); i = (i - (${ftype})floor((${ftype})(dim1)/2.0))/(${ftype})(dim1); ${ftype} j = (${ftype})(x / dim3); if((SIZE_T)j > dim2) {j=(${ftype})fmod(j, (${ftype})dim2);}; j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2); //Account for large global index (stored as ulong) before performing modulus double pre_k=fmod((double)(x) , (double) dim3); ${ftype} k = (${ftype}) pre_k; k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3); ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); //${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1; //${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); dest[idx].x = src[idx].x * weight * factor; dest[idx].y = src[idx].y * weight * factor; } """ % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR), render_kwds=dict(ctype=dtypes.ctype(dtype), ftype=dtypes.ctype(ftype), exp=functions.exp(ftype)), fast_math=True) gauss_kernel = program.gauss_kernel #data_dev = thr.empty_like(ksp_dev) gauss_kernel(data_dev, data_dev, global_size=(sz[0], sz[1], sz[2])) thr.synchronize() #data_dev = thr.to_device(ksp) ifft = FFT(data_dev) cifft = ifft.compile(thr) fftshift = FFTShift(data_dev) cfftshift = fftshift.compile(thr) cifft(data_dev, data_dev, inverse=0) thr.synchronize() cfftshift(data_dev, data_dev) thr.synchronize() result2 = data_dev.get() / np.prod(np.array(ksp.shape)) result2 = result2[::-1, ::-1, ::-1] thr.release() return result2
def test_exp(thr, out_code, in_codes): out_dtype, in_dtypes = generate_dtypes(out_code, in_codes) check_func(thr, functions.exp(in_dtypes[0]), numpy.exp, out_dtype, in_dtypes)
def _build_plan(self, plan_factory, device_params, output, alpha, beta): plan = plan_factory() samples, modes = alpha.shape for_reduction = Type(alpha.dtype, (samples, self._max_total_clicks + 1)) prepared_state = plan.temp_array_like(alpha) plan.kernel_call( TEMPLATE.get_def("compound_click_probability_prepare"), [prepared_state, alpha, beta], kernel_name="compound_click_probability_prepare", global_size=alpha.shape, render_kwds=dict( mul_cc=functions.mul(alpha.dtype, alpha.dtype), exp_c=functions.exp(alpha.dtype), )) # Block size is limited by the amount of available local memory. # In some OpenCL implementations the number reported cannot actually be fully used # (because it's used by kernel arguments), so we're padding it a little. local_mem_size = device_params.local_mem_size max_elems = (local_mem_size - 256) // alpha.dtype.itemsize block_size = 2**helpers.log2(max_elems) # No reason to have block size larger than the number of modes block_size = min(block_size, helpers.bounding_power_of_2(modes)) products_gsize = (samples, helpers.min_blocks(self._max_total_clicks + 1, block_size) * block_size) products = plan.temp_array_like(for_reduction) read_size = min(block_size, device_params.max_work_group_size) while read_size > 1: full_steps = modes // block_size remainder_size = modes % block_size try: plan.kernel_call( TEMPLATE.get_def("compound_click_probability_aggregate"), [products, prepared_state], kernel_name="compound_click_probability_aggregate", global_size=products_gsize, local_size=(1, read_size,), render_kwds=dict( block_size=block_size, read_size=read_size, full_steps=full_steps, remainder_size=remainder_size, output_size=self._max_total_clicks + 1, mul_cc=functions.mul(alpha.dtype, alpha.dtype), add_cc=functions.add(alpha.dtype, alpha.dtype), polar_unit=functions.polar_unit(dtypes.real_for(alpha.dtype)), modes=self._system.modes, max_total_clicks=self._max_total_clicks, )) except OutOfResourcesError: read_size //= 2 break reduction = Reduce(for_reduction, predicate_sum(alpha.dtype), axes=(0,)) temp = plan.temp_array_like(reduction.parameter.output) plan.computation_call(reduction, temp, products) fft = FFT(temp) real_trf = Transformation([ Parameter('output', Annotation(output, 'o')), Parameter('input', Annotation(temp, 'i')), ], """ ${input.ctype} val = ${input.load_same}; ${output.store_same}(val.x); """) fft.parameter.output.connect(real_trf, real_trf.input, output_p=real_trf.output) plan.computation_call(fft, output, temp, True) return plan
double pre_k=fmod((double)(x) , (double) dim3); ${ftype} k = (${ftype}) pre_k; k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3); ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); //${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1; //${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); dest[idx].x = src[idx].x * weight; dest[idx].y = src[idx].y * weight; //(${ftype})k; // } """ % (N, N, N, sigma[0], sigma[1], sigma[2], FACTOR), render_kwds=dict(ctype=dtypes.ctype(dtype), ftype=dtypes.ctype( ftype), exp=functions.exp(ftype)), fast_math=True) gauss_kernel = program.gauss_kernel r1 = np.ones((N, N, N)).astype(ftype) # /N r2 = np.ones((N, N, N)).astype(ftype) # /N a = r1 + 1j * r2 b = r1 - 1j * r2 a_dev = thr.to_device(a) #b_dev = thr.to_device(b) #c_dev= thr.to_device(b.ravel()) #sigma_dev = thr.to_device(sigma) dest_dev = thr.empty_like(a_dev) # (np.pi).astype(np.float32),