def logistic(context, activations, bias, dest=None):
    """Apply a biased logistic (sigmoid) activation with a cached kernel.

    For every element the kernel adds the bias selected by the element's
    second index, clamps the sum to ``[-45, 45]`` and stores
    ``1 / (1 + exp(-x))`` in ``dest``.

    The compiled kernel is memoised in ``context.kernel_cache`` under
    ``(logistic, activations.shape, context.thread)`` and is only built on a
    cache miss.

    :param context: object providing ``kernel_cache`` (a mapping) and
        ``thread``.
    :param activations: input array; also the guiding array of the kernel.
    :param bias: bias array; ``bias.shape[0]`` must equal
        ``activations.shape[1]`` (asserted only when the kernel is compiled).
    :param dest: output array; when ``None`` the result overwrites
        ``activations``.
    :returns: ``dest``.
    """
    kernel_cache, thread = context.kernel_cache, context.thread

    if dest is None:
        dest = activations

    key = (logistic, activations.shape, thread)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        assert activations.shape[1] == bias.shape[0]
        kernel = PureParallel(
            [
                Parameter('activations', Annotation(activations, 'i')),
                Parameter('bias', Annotation(bias, 'i')),
                Parameter('dest', Annotation(dest, 'o')),
            ],
            """
        ${activations.ctype} a = ${activations.load_same};
        ${bias.ctype} b = ${bias.load_idx}(${idxs[1]});

        a += b;
        a = min(max(-45.0f, a), 45.0f);
        a = 1.0f / (1.0f + exp(-a));
        ${dest.store_same}(a);
        """,
            guiding_array='activations')
        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # Run kernel
    kernel_cache[key](activations, bias, dest)
    return dest
def classification_delta_kernel(ctx, outputs, targets, deltas):
    """Write classification deltas for integer class targets.

    For the element at ``(row, col)`` the kernel stores ``1 - out`` when
    ``targets[row] == col`` and ``-out`` otherwise, where ``out`` is the
    matching element of ``outputs``.

    The compiled kernel is memoised in ``ctx.kernel_cache`` under
    ``(classification_delta_kernel, outputs.shape)``.

    :param ctx: object providing ``kernel_cache`` (a mapping) and ``thread``.
    :param outputs: 2-D array of network outputs.
    :param targets: 1-D ``int32`` array with one class index per row.
    :param deltas: output array; its first two dimensions must match
        ``outputs``.
    """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    assert outputs.shape[0] == targets.shape[0] == deltas.shape[0]
    assert len(targets.shape) == 1
    assert targets.dtype == numpy.int32
    assert outputs.shape[1] == deltas.shape[1]

    key = (classification_delta_kernel, outputs.shape)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        kernel = PureParallel(
            [
                Parameter('outputs', Annotation(outputs, 'i')),
                Parameter('targets', Annotation(targets, 'i')),
                Parameter('deltas', Annotation(deltas, 'o'))
            ],
            """
        ${outputs.ctype} out = ${outputs.load_same};
        SIZE_T t = ${targets.load_idx}(${idxs[0]});
        SIZE_T idx = ${idxs[1]};
        ${deltas.ctype} d;
        if (t == idx) {
            d = 1.0f - out;
        } else {
            d = -out;
        }
        ${deltas.store_same}(d);
        """,
            guiding_array='deltas')
        kernel_cache[key] = kernel.compile(thread)

    # Run kernel
    kernel_cache[key](outputs, targets, deltas)
def test_guiding_output(thr):
    """Check a ``PureParallel`` that is guided by its output parameter.

    The kernel adds the two rows of a ``(2, N)`` input and writes the sums to
    an ``N``-element output; the result is compared with ``a[0] + a[1]``.
    """
    N = 1000
    dtype = numpy.float32

    parameters = [
        Parameter('output', Annotation(Type(dtype, shape=N), 'o')),
        Parameter('input', Annotation(Type(dtype, shape=(2, N)), 'i')),
    ]
    code = """
        float t1 = ${input.load_idx}(0, ${idxs[0]});
        float t2 = ${input.load_idx}(1, ${idxs[0]});
        ${output.store_idx}(${idxs[0]}, t1 + t2);
        """
    computation = PureParallel(parameters, code, guiding_array='output')

    host_input = get_test_array_like(computation.parameter.input)
    device_input = thr.to_device(host_input)
    device_output = thr.empty_like(computation.parameter.output)

    compiled = computation.compile(thr)
    compiled(device_output, device_input)

    expected = host_input[0] + host_input[1]
    assert diff_is_negligible(device_output.get(), expected)
def test_guiding_output(thr):
    """Check a ``PureParallel`` that is guided by its output parameter.

    The kernel adds the two rows of a ``(2, N)`` input and writes the sums to
    an ``N``-element output; the result is compared with ``a[0] + a[1]``.
    """
    N = 1000
    dtype = numpy.float32

    parameters = [
        Parameter('output', Annotation(Type(dtype, shape=N), 'o')),
        Parameter('input', Annotation(Type(dtype, shape=(2, N)), 'i')),
    ]
    code = """
        float t1 = ${input.load_idx}(0, ${idxs[0]});
        float t2 = ${input.load_idx}(1, ${idxs[0]});
        ${output.store_idx}(${idxs[0]}, t1 + t2);
        """
    computation = PureParallel(parameters, code, guiding_array='output')

    host_input = get_test_array_like(computation.parameter.input)
    device_input = thr.to_device(host_input)
    device_output = thr.empty_like(computation.parameter.output)

    compiled = computation.compile(thr)
    compiled(device_output, device_input)

    expected = host_input[0] + host_input[1]
    assert diff_is_negligible(device_output.get(), expected)
def logistic(context, activations, bias, dest=None):
    """Apply a biased logistic (sigmoid) activation with a cached kernel.

    For every element the kernel adds the bias selected by the element's
    second index, clamps the sum to ``[-45, 45]`` and stores
    ``1 / (1 + exp(-x))`` in ``dest``.

    The compiled kernel is memoised in ``context.kernel_cache`` under
    ``(logistic, activations.shape, context.thread)`` and is only built on a
    cache miss.

    :param context: object providing ``kernel_cache`` (a mapping) and
        ``thread``.
    :param activations: input array; also the guiding array of the kernel.
    :param bias: bias array; ``bias.shape[0]`` must equal
        ``activations.shape[1]`` (asserted only when the kernel is compiled).
    :param dest: output array; when ``None`` the result overwrites
        ``activations``.
    :returns: ``dest``.
    """
    kernel_cache, thread = context.kernel_cache, context.thread

    if dest is None:
        dest = activations

    key = (logistic, activations.shape, thread)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        assert activations.shape[1] == bias.shape[0]
        kernel = PureParallel(
            [
                Parameter('activations', Annotation(activations, 'i')),
                Parameter('bias', Annotation(bias, 'i')),
                Parameter('dest', Annotation(dest, 'o')),
            ],
            """
        ${activations.ctype} a = ${activations.load_same};
        ${bias.ctype} b = ${bias.load_idx}(${idxs[1]});

        a += b;
        a = min(max(-45.0f, a), 45.0f);
        a = 1.0f / (1.0f + exp(-a));
        ${dest.store_same}(a);
        """,
            guiding_array='activations')
        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # Run kernel
    kernel_cache[key](activations, bias, dest)
    return dest
def classification_delta_kernel(ctx, outputs, targets, deltas):
    """Write classification deltas for integer class targets.

    For the element at ``(row, col)`` the kernel stores ``1 - out`` when
    ``targets[row] == col`` and ``-out`` otherwise, where ``out`` is the
    matching element of ``outputs``.

    The compiled kernel is memoised in ``ctx.kernel_cache`` under
    ``(classification_delta_kernel, outputs.shape)``.

    :param ctx: object providing ``kernel_cache`` (a mapping) and ``thread``.
    :param outputs: 2-D array of network outputs.
    :param targets: 1-D ``int32`` array with one class index per row.
    :param deltas: output array; its first two dimensions must match
        ``outputs``.
    """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    assert outputs.shape[0] == targets.shape[0] == deltas.shape[0]
    assert len(targets.shape) == 1
    assert targets.dtype == numpy.int32
    assert outputs.shape[1] == deltas.shape[1]

    key = (classification_delta_kernel, outputs.shape)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        kernel = PureParallel(
            [
                Parameter('outputs', Annotation(outputs, 'i')),
                Parameter('targets', Annotation(targets, 'i')),
                Parameter('deltas', Annotation(deltas, 'o'))
            ],
            """
        ${outputs.ctype} out = ${outputs.load_same};
        SIZE_T t = ${targets.load_idx}(${idxs[0]});
        SIZE_T idx = ${idxs[1]};
        ${deltas.ctype} d;
        if (t == idx) {
            d = 1.0f - out;
        } else {
            d = -out;
        }
        ${deltas.store_same}(d);
        """,
            guiding_array='deltas')
        kernel_cache[key] = kernel.compile(thread)

    # Run kernel
    kernel_cache[key](outputs, targets, deltas)
def logistic_derivative(context, activations, delta, dest=None):
    """Scale ``delta`` by the logistic derivative of ``activations``.

    The kernel stores ``delta * a * (1 - a)`` element-wise, where ``a`` is the
    matching element of ``activations``.

    The compiled kernel is memoised in ``context.kernel_cache`` under
    ``(logistic_derivative, activations.shape, context.thread)``.

    :param context: object providing ``kernel_cache`` (a mapping) and
        ``thread``.
    :param activations: input array; also the guiding array of the kernel.
    :param delta: input array of incoming deltas.
    :param dest: output array; when ``None`` the result overwrites ``delta``.
    """
    kernel_cache, thread = context.kernel_cache, context.thread

    if dest is None:
        dest = delta

    key = (logistic_derivative, activations.shape, thread)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        kernel = PureParallel(
            [
                Parameter('activations', Annotation(activations, 'i')),
                # Describe ``delta`` by its own array rather than by
                # ``activations`` so its dtype/shape are taken from the
                # argument that is actually passed.
                Parameter('delta', Annotation(delta, 'i')),
                Parameter('dest', Annotation(dest, 'o')),
            ],
            """
        ${activations.ctype} a = ${activations.load_same};
        ${delta.ctype} d = ${delta.load_same};

        d = d*a*(1.0f - a);
        ${dest.store_same}(d);
        """,
            guiding_array='activations')
        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # Run kernel
    kernel_cache[key](activations, delta, dest)
def logistic_derivative(context, activations, delta, dest=None):
    """Scale ``delta`` by the logistic derivative of ``activations``.

    The kernel stores ``delta * a * (1 - a)`` element-wise, where ``a`` is the
    matching element of ``activations``.

    The compiled kernel is memoised in ``context.kernel_cache`` under
    ``(logistic_derivative, activations.shape, context.thread)``.

    :param context: object providing ``kernel_cache`` (a mapping) and
        ``thread``.
    :param activations: input array; also the guiding array of the kernel.
    :param delta: input array of incoming deltas.
    :param dest: output array; when ``None`` the result overwrites ``delta``.
    """
    kernel_cache, thread = context.kernel_cache, context.thread

    if dest is None:
        dest = delta

    key = (logistic_derivative, activations.shape, thread)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        kernel = PureParallel(
            [
                Parameter('activations', Annotation(activations, 'i')),
                # Describe ``delta`` by its own array rather than by
                # ``activations`` so its dtype/shape are taken from the
                # argument that is actually passed.
                Parameter('delta', Annotation(delta, 'i')),
                Parameter('dest', Annotation(dest, 'o')),
            ],
            """
        ${activations.ctype} a = ${activations.load_same};
        ${delta.ctype} d = ${delta.load_same};

        d = d*a*(1.0f - a);
        ${dest.store_same}(d);
        """,
            guiding_array='activations')
        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # Run kernel
    kernel_cache[key](activations, delta, dest)
def class_errors(ctx, expected, actual, errors):
    """Flag misclassified rows with a cached kernel.

    For each row the kernel finds the index of the largest value in
    ``actual`` (starting from a maximum of ``0``), writes ``1`` to ``errors``
    when that maximum is below ``1 / n_classes + 0.001`` or when the index
    differs from ``expected``, and ``0`` otherwise.

    ``expected`` and ``errors`` are ``int32``; ``actual`` is ``float32``
    (asserted only when the kernel is compiled). The compiled kernel is
    memoised in ``ctx.kernel_cache`` under ``(class_errors, expected.shape)``.
    """
    cache = ctx.kernel_cache
    thread = ctx.thread

    key = (class_errors, expected.shape)
    if key not in cache:
        # target should be an integer
        logging.info("compiling " + str(key))
        # one neuron per class
        assert expected.shape == errors.shape
        # index of the class
        assert expected.shape == (actual.shape[0],)
        assert actual.dtype == numpy.float32
        assert expected.dtype == numpy.int32
        assert errors.dtype == numpy.int32

        parameters = [
            Parameter('expected', Annotation(expected, 'i')),
            Parameter('actual', Annotation(actual, 'i')),
            Parameter('errors', Annotation(errors, 'o')),
        ]
        code = """
        SIZE_T expected = ${expected.load_idx}(${idxs[0]});;
        float maximum=0.0f;
        float value;
        SIZE_T maxindex = 0;

        SIZE_T tl = ${target_length};

        // calculate argmax
        for(SIZE_T j=0; j < tl; j++) {
            value = ${actual.load_idx}(${idxs[0]}, j);

            if (value > maximum) {
                maximum = value;
                maxindex = j;
            }
        }

        // If the confidence is too low, return an error
        if (maximum < (1.0f / ${target_length}.0f + 0.001f)) {
            ${errors.store_same}(1);
            return;
        };

        // compare argmax
        if (maxindex != expected) {
            ${errors.store_same}(1);
        } else {
            ${errors.store_same}(0);
        }
        """
        kernel = PureParallel(
            parameters,
            code,
            guiding_array='expected',
            render_kwds={'target_length': numpy.int32(actual.shape[1])})
        cache[key] = kernel.compile(thread)

    cache[key](expected, actual, errors)
def convolve2d_propagation(ctx, array, weights, dest):
    """Run the cached 2-D convolution kernel and return ``dest``.

    The output is the valid discrete linear convolution of the inputs.

    The kernel is guided by ``dest`` and reads five indices from it
    (``number, channel, filter, xout, yout``); for each output element it
    accumulates ``array[number, channel, x, y] * weights[channel, filter, i, j]``
    over the filter window given by ``weights.shape[2:4]``.

    The compiled kernel is memoised in ``ctx.kernel_cache`` under
    ``(convolve2d_propagation, weights.shape, array.shape, ctx.thread)``.

    :param ctx: object providing ``kernel_cache`` (a mapping) and ``thread``.
    :param array: input array, indexed with four indices by the kernel.
    :param weights: filter array, indexed with four indices by the kernel.
    :param dest: output array (guiding array).
    :returns: ``dest``.
    """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    key = (convolve2d_propagation, weights.shape, array.shape, thread)
    if key not in kernel_cache:
        logging.info("compiling " + str(key))

        render_kwds = {
            'w0': weights.shape[2],
            'w1': weights.shape[3],
            'a0': array.shape[2],
            'a1': array.shape[3],
            'off0': int(weights.shape[2] - 1),
            'off1': int(weights.shape[3] - 1)
        }

        kernel_conv = PureParallel(
            [
                Parameter('array', Annotation(array, 'i')),
                Parameter('weights', Annotation(weights, 'i')),
                Parameter('dest', Annotation(dest, 'o'))
            ],
            """
        // Array dimensions:
        // array : (channels, width, height)
        // weights: (channels, filters, fwidth, fheight)
        // dest (channels, filters, owidth, oheight)

        float a = 0.0f;
        SIZE_T x, y, i, j;
        const SIZE_T number = ${idxs[0]};
        const SIZE_T channel = ${idxs[1]};
        const SIZE_T filter = ${idxs[2]};
        const SIZE_T xout = ${idxs[3]};
        const SIZE_T yout = ${idxs[4]};
        for (i=0; i < ${w0}; i++){
            for (j=0; j < ${w1}; j++){
                x = xout - i + ${off0};
                y = yout - j + ${off1};
                a += ${array.load_idx}(number, channel, x, y) * ${weights.load_idx}(channel, filter, i, j); // channel, filter, i, j
            }
        }

        ${dest.store_same}(a);
        """,
            guiding_array='dest',
            render_kwds=render_kwds)
        kernel_cache[key] = kernel_conv.compile(thread, fast_math=True)

    # run convolution
    kernel_cache[key](array, weights, dest)
    return dest
def convolve2d_propagation(ctx, array, weights, dest):
    """Run the cached 2-D convolution kernel and return ``dest``.

    The output is the valid discrete linear convolution of the inputs.

    The kernel is guided by ``dest`` and reads five indices from it
    (``number, channel, filter, xout, yout``); for each output element it
    accumulates ``array[number, channel, x, y] * weights[channel, filter, i, j]``
    over the filter window given by ``weights.shape[2:4]``.

    The compiled kernel is memoised in ``ctx.kernel_cache`` under
    ``(convolve2d_propagation, weights.shape, array.shape, ctx.thread)``.

    :param ctx: object providing ``kernel_cache`` (a mapping) and ``thread``.
    :param array: input array, indexed with four indices by the kernel.
    :param weights: filter array, indexed with four indices by the kernel.
    :param dest: output array (guiding array).
    :returns: ``dest``.
    """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    key = (convolve2d_propagation, weights.shape, array.shape, thread)
    if key not in kernel_cache:
        logging.info("compiling " + str(key))

        render_kwds = {
            'w0': weights.shape[2],
            'w1': weights.shape[3],
            'a0': array.shape[2],
            'a1': array.shape[3],
            'off0': int(weights.shape[2] - 1),
            'off1': int(weights.shape[3] - 1)
        }

        kernel_conv = PureParallel(
            [
                Parameter('array', Annotation(array, 'i')),
                Parameter('weights', Annotation(weights, 'i')),
                Parameter('dest', Annotation(dest, 'o'))
            ],
            """
        // Array dimensions:
        // array : (channels, width, height)
        // weights: (channels, filters, fwidth, fheight)
        // dest (channels, filters, owidth, oheight)

        float a = 0.0f;
        SIZE_T x, y, i, j;
        const SIZE_T number = ${idxs[0]};
        const SIZE_T channel = ${idxs[1]};
        const SIZE_T filter = ${idxs[2]};
        const SIZE_T xout = ${idxs[3]};
        const SIZE_T yout = ${idxs[4]};
        for (i=0; i < ${w0}; i++){
            for (j=0; j < ${w1}; j++){
                x = xout - i + ${off0};
                y = yout - j + ${off1};
                a += ${array.load_idx}(number, channel, x, y) * ${weights.load_idx}(channel, filter, i, j); // channel, filter, i, j
            }
        }

        ${dest.store_same}(a);
        """,
            guiding_array='dest',
            render_kwds=render_kwds)
        kernel_cache[key] = kernel_conv.compile(thread, fast_math=True)

    # run convolution
    kernel_cache[key](array, weights, dest)
    return dest
def class_errors(ctx, expected, actual, errors):
    """Flag misclassified rows with a cached kernel.

    For each row the kernel finds the index of the largest value in
    ``actual`` (starting from a maximum of ``0``), writes ``1`` to ``errors``
    when that maximum is below ``1 / n_classes + 0.001`` or when the index
    differs from ``expected``, and ``0`` otherwise.

    ``expected`` and ``errors`` are ``int32``; ``actual`` is ``float32``
    (asserted only when the kernel is compiled). The compiled kernel is
    memoised in ``ctx.kernel_cache`` under ``(class_errors, expected.shape)``.
    """
    cache = ctx.kernel_cache
    thread = ctx.thread

    key = (class_errors, expected.shape)
    if key not in cache:
        # target should be an integer
        logging.info("compiling " + str(key))
        # one neuron per class
        assert expected.shape == errors.shape
        # index of the class
        assert expected.shape == (actual.shape[0],)
        assert actual.dtype == numpy.float32
        assert expected.dtype == numpy.int32
        assert errors.dtype == numpy.int32

        parameters = [
            Parameter('expected', Annotation(expected, 'i')),
            Parameter('actual', Annotation(actual, 'i')),
            Parameter('errors', Annotation(errors, 'o')),
        ]
        code = """
        SIZE_T expected = ${expected.load_idx}(${idxs[0]});;
        float maximum=0.0f;
        float value;
        SIZE_T maxindex = 0;

        SIZE_T tl = ${target_length};

        // calculate argmax
        for(SIZE_T j=0; j < tl; j++) {
            value = ${actual.load_idx}(${idxs[0]}, j);

            if (value > maximum) {
                maximum = value;
                maxindex = j;
            }
        }

        // If the confidence is too low, return an error
        if (maximum < (1.0f / ${target_length}.0f + 0.001f)) {
            ${errors.store_same}(1);
            return;
        };

        // compare argmax
        if (maxindex != expected) {
            ${errors.store_same}(1);
        } else {
            ${errors.store_same}(0);
        }
        """
        kernel = PureParallel(
            parameters,
            code,
            guiding_array='expected',
            render_kwds={'target_length': numpy.int32(actual.shape[1])})
        cache[key] = kernel.compile(thread)

    cache[key](expected, actual, errors)
def setitem_computation(dest, source):
    """
    Build a computation that writes ``source`` into ``dest``.

    A ``source`` with an empty shape is handled with a ``broadcast_param``
    transformation on ``dest``. Otherwise a copy transformation is used, with
    a cast of ``source`` to ``dest.dtype`` attached to its input.
    The returned computation is not compiled here.
    """
    source_is_scalar = len(source.shape) == 0

    if not source_is_scalar:
        # Describe the source as it looks after the cast to ``dest.dtype``.
        casted_type = Type.from_value(source).with_dtype(dest.dtype)
        copy_trf = transformations.copy(casted_type, dest)
        computation = PureParallel.from_trf(copy_trf, guiding_array=copy_trf.output)

        cast_trf = transformations.cast(source, dest.dtype)
        computation.parameter.input.connect(
            cast_trf, cast_trf.output, src_input=cast_trf.input)
        return computation

    broadcast_trf = transformations.broadcast_param(dest)
    return PureParallel.from_trf(broadcast_trf, guiding_array=broadcast_trf.output)
def _build_plan(self, plan_factory, device_params, a, current_variances, mu):
    """Build a plan with one kernel that fills ``a`` and ``current_variances``.

    ``a`` receives the element of ``mu`` (indexed without the second-to-last
    index) where the second-to-last index equals ``self._mask_size`` and ``0``
    elsewhere. ``current_variances`` is set to ``0`` once per position, by
    the work item whose last index is ``0``.
    """
    plan = plan_factory()

    parameters = [
        Parameter('a', Annotation(a, 'o')),
        Parameter('current_variances', Annotation(current_variances, 'o')),
        Parameter('mu', Annotation(mu, 'i')),
    ]
    code = """
        ${a.ctype} a;
        if (${idxs[-2]} == ${mask_size})
        {
            a = ${mu.load_idx}(${", ".join(idxs[:-2])}, ${idxs[-1]});
        }
        else
        {
            a = 0;
        }
        ${a.store_same}(a);

        if (${idxs[-1]} == 0)
        {
            ${current_variances.store_idx}(${", ".join(idxs[:-1])}, 0);
        }
        """
    fill = PureParallel(
        parameters, code, render_kwds=dict(mask_size=self._mask_size))

    plan.computation_call(fill, a, current_variances, mu)
    return plan
def test_tgsw_polynomial_decomp_trf(thread):
    """Compare the TGSW polynomial decomposition transformation with its reference.

    The transformation is wrapped into a ``PureParallel`` guided by ``result``,
    run on the device, and required to match the reference element for element.
    """
    batch_shape = (2, 3)

    tgsw_params = NuFHEParameters().tgsw_params
    tlwe_params = tgsw_params.tlwe_params
    decomp_length = tgsw_params.decomp_length
    mask_size = tlwe_params.mask_size
    polynomial_degree = tlwe_params.polynomial_degree

    sample_shape = batch_shape + (mask_size + 1, polynomial_degree)
    result_shape = batch_shape + (mask_size + 1, decomp_length, polynomial_degree)

    sample = get_test_array(sample_shape, Torus32, (0, 1000))
    expected = numpy.empty(result_shape, dtype=Int32)

    sample_dev = thread.to_device(sample)
    result_dev = thread.empty_like(expected)

    trf = get_tgsw_polynomial_decomp_trf(tgsw_params, batch_shape)
    device_comp = PureParallel.from_trf(trf, guiding_array='result').compile(thread)
    reference = tgsw_polynomial_decomp_trf_reference(tgsw_params, batch_shape)

    device_comp(result_dev, sample_dev)
    actual = result_dev.get()

    # The reference fills ``expected`` in place.
    reference(expected, sample)

    assert (expected == actual).all()
def test_trf_with_guiding_output(thr):
    """
    Test the creation of ``PureParallel`` out of a transformation,
    with an output parameter as a guiding array.

    The device result is compared with the input multiplied by ``coeff``.
    """
    N = 1000
    coeff = 3
    dtype = numpy.float32

    arr_t = Type(dtype, shape=N)
    trf = mul_param(arr_t, dtype)
    p = PureParallel.from_trf(trf, trf.output)

    # The new PureParallel has to preserve the parameter list of the original transformation.
    assert list(p.signature.parameters.values()) == list(trf.signature.parameters.values())

    a = get_test_array_like(p.parameter.input)
    a_dev = thr.to_device(a)
    res_dev = thr.empty_like(p.parameter.output)

    pc = p.compile(thr)
    pc(res_dev, a_dev, coeff)

    # Use the same coefficient that was passed to the kernel instead of
    # repeating its literal value.
    assert diff_is_negligible(res_dev.get(), a * coeff)
def roll_computation(array, axis):
    """Build a ``PureParallel`` that rolls ``input`` along ``axis`` into ``output``.

    The computation takes ``output``, ``input`` (both described by ``array``)
    and an ``int32`` scalar ``shift``. Along ``axis`` each element is stored at
    ``(idx + (size + shift % size)) % size`` (or at ``idx`` when ``shift`` is
    zero); all other indices are kept.
    """
    parameters = [
        Parameter('output', Annotation(array, 'o')),
        Parameter('input', Annotation(array, 'i')),
        Parameter('shift', Annotation(Type(numpy.int32))),
    ]

    # Mako control lines (``%for``, ``%if``, ...) and ``##`` comments must each
    # sit on their own line, so the line structure of this template matters.
    template = """
        <%
            shape = input.shape
        %>
        %for i in range(len(shape)):
            VSIZE_T output_${idxs[i]} =
                %if i == axis:
                ${shift} == 0 ?
                    ${idxs[i]} :
                    ## Since ``shift`` can be negative, and its absolute value greater than
                    ## ``shape[i]``, a double modulo division is necessary
                    ## (the ``%`` operator preserves the sign of the dividend in C).
                    (${idxs[i]} + (${shape[i]} + ${shift} % ${shape[i]})) % ${shape[i]};
                %else:
                ${idxs[i]};
                %endif
        %endfor
        ${output.store_idx}(
            ${", ".join("output_" + name for name in idxs)},
            ${input.load_idx}(${", ".join(idxs)}));
        """

    return PureParallel(
        parameters,
        template,
        guiding_array='input',
        render_kwds=dict(axis=axis))
def test_trf_with_guiding_output(thr):
    """
    Test the creation of ``PureParallel`` out of a transformation,
    with an output parameter as a guiding array.

    The device result is compared with the input multiplied by ``coeff``.
    """
    N = 1000
    coeff = 3
    dtype = numpy.float32

    arr_t = Type(dtype, shape=N)
    trf = mul_param(arr_t, dtype)
    p = PureParallel.from_trf(trf, trf.output)

    # The new PureParallel has to preserve the parameter list of the original transformation.
    assert list(p.signature.parameters.values()) == list(trf.signature.parameters.values())

    a = get_test_array_like(p.parameter.input)
    a_dev = thr.to_device(a)
    res_dev = thr.empty_like(p.parameter.output)

    pc = p.compile(thr)
    pc(res_dev, a_dev, coeff)

    # Use the same coefficient that was passed to the kernel instead of
    # repeating its literal value.
    assert diff_is_negligible(res_dev.get(), a * coeff)
def identity(type):
    """Build a ``PureParallel`` that copies ``input`` to ``output`` element-wise.

    Both parameters are described by ``type``.
    """
    parameters = [
        Parameter('output', Annotation(type, 'o')),
        Parameter('input', Annotation(type, 'i')),
    ]
    code = """
        ${output.store_same}(${input.load_same});
        """
    return PureParallel(parameters, code)
def test_from_trf(thr, guiding_array):
    """
    Test the creation of ``PureParallel`` out of a transformation
    with various values of the guiding array.

    ``guiding_array`` is one of ``'input'``, ``'output'`` or ``'none'``;
    any other value raises ``ValueError``.
    """
    N = 1000
    coeff = 3
    dtype = numpy.float32

    arr_t = Type(dtype, shape=N)
    trf = mul_param(arr_t, dtype)

    if guiding_array == 'input':
        arr = trf.input
    elif guiding_array == 'output':
        arr = trf.output
    elif guiding_array == 'none':
        arr = None
    else:
        # Fail with a clear message instead of an unbound local further down.
        raise ValueError("unknown guiding_array value: " + repr(guiding_array))

    p = PureParallel.from_trf(trf, guiding_array=arr)

    # The new PureParallel has to preserve the parameter list of the original transformation.
    assert list(p.signature.parameters.values()) == list(trf.signature.parameters.values())

    a = get_test_array_like(p.parameter.input)
    a_dev = thr.to_device(a)
    res_dev = thr.empty_like(p.parameter.output)

    pc = p.compile(thr)
    pc(res_dev, a_dev, coeff)

    # Use the same coefficient that was passed to the kernel instead of
    # repeating its literal value.
    assert diff_is_negligible(res_dev.get(), a * coeff)
def _build_plan(self, plan_factory, device_params, output, input_, inverse):
    """Build the fftshift plan for the axes in ``self._axes``.

    When the product of the shifted axis lengths is 1 the work is delegated to
    ``self._build_trivial_plan``. When every shifted axis has even length the
    in-place kernel is used; otherwise the shift goes to a temporary array,
    which is then copied into ``output``.
    """
    shifted_size = helpers.product([input_.shape[i] for i in self._axes])
    if shifted_size == 1:
        return self._build_trivial_plan(plan_factory, output, input_)

    plan = plan_factory()

    axes = tuple(sorted(self._axes))
    global_size = list(input_.shape)
    all_even = all(global_size[axis] % 2 == 0 for axis in axes)

    if not all_even:
        # Resort to an out-of-place shift to a temporary array and then copy.
        temp = plan.temp_array_like(output)
        plan.kernel_call(
            TEMPLATE.get_def('fftshift_outplace'),
            [temp, input_, inverse],
            kernel_name="kernel_fftshift_outplace",
            global_size=global_size,
            render_kwds=dict(axes=axes))

        copy_trf = copy(input_, out_arr_t=output)
        copy_comp = PureParallel.from_trf(copy_trf, copy_trf.input)
        plan.computation_call(copy_comp, output, temp)
        return plan

    # If all shift axes have even length, it is possible to perform the shift inplace
    # (by swapping pairs of elements).
    # Note that the inplace fftshift is its own inverse.
    global_size[axes[0]] //= 2
    plan.kernel_call(
        TEMPLATE.get_def('fftshift_inplace'),
        [output, input_],
        kernel_name="kernel_fftshift_inplace",
        global_size=global_size,
        render_kwds=dict(axes=axes))
    return plan
def _build_plan(self, plan_factory, device_params, output, input_, inverse):
    """Build the fftshift plan for the axes in ``self._axes``.

    When the product of the shifted axis lengths is 1 the work is delegated to
    ``self._build_trivial_plan``. When every shifted axis has even length the
    in-place kernel is used; otherwise the shift goes to a temporary array,
    which is then copied into ``output``.
    """
    shifted_size = helpers.product([input_.shape[i] for i in self._axes])
    if shifted_size == 1:
        return self._build_trivial_plan(plan_factory, output, input_)

    plan = plan_factory()

    axes = tuple(sorted(self._axes))
    global_size = list(input_.shape)
    all_even = all(global_size[axis] % 2 == 0 for axis in axes)

    if not all_even:
        # Resort to an out-of-place shift to a temporary array and then copy.
        temp = plan.temp_array_like(output)
        plan.kernel_call(
            TEMPLATE.get_def('fftshift_outplace'),
            [temp, input_, inverse],
            kernel_name="kernel_fftshift_outplace",
            global_size=global_size,
            render_kwds=dict(axes=axes))

        copy_trf = copy(input_, out_arr_t=output)
        copy_comp = PureParallel.from_trf(copy_trf, copy_trf.input)
        plan.computation_call(copy_comp, output, temp)
        return plan

    # If all shift axes have even length, it is possible to perform the shift inplace
    # (by swapping pairs of elements).
    # Note that the inplace fftshift is its own inverse.
    global_size[axes[0]] //= 2
    plan.kernel_call(
        TEMPLATE.get_def('fftshift_inplace'),
        [output, input_],
        kernel_name="kernel_fftshift_inplace",
        global_size=global_size,
        render_kwds=dict(axes=axes))
    return plan
def _build_plan(
        self, plan_factory, device_params,
        result_a, result_b, result_cv, messages, key, noises_a, noises_b):
    """Build the plan that fills ``result_a``, ``result_b`` and ``result_cv``.

    A ``MatrixMulVector`` over ``noises_a`` has a transformation attached to
    its output which stores ``noises_b + messages + <product>`` in
    ``result_b`` and ``self._noise ** 2`` in ``result_cv``. A second call
    copies ``noises_a`` into ``result_a``.
    """
    plan = plan_factory()

    mul_key = MatrixMulVector(noises_a)

    trf_parameters = [
        Parameter('result_b', Annotation(result_b, 'o')),
        Parameter('result_cv', Annotation(result_cv, 'o')),
        Parameter('messages', Annotation(messages, 'i')),
        Parameter('noises_a_times_key', Annotation(noises_b, 'i')),
        Parameter('noises_b', Annotation(noises_b, 'i')),
    ]
    trf_code = """
        ${result_b.store_same}(
            ${noises_b.load_same}
            + ${messages.load_same}
            + ${noises_a_times_key.load_same});
        ${result_cv.store_same}(${noise**2});
        """
    fill_b_cv = Transformation(
        trf_parameters,
        trf_code,
        connectors=['noises_a_times_key'],
        render_kwds=dict(noise=self._noise))

    mul_key.parameter.output.connect(
        fill_b_cv, fill_b_cv.noises_a_times_key,
        b=fill_b_cv.result_b,
        cv=fill_b_cv.result_cv,
        messages=fill_b_cv.messages,
        noises_b=fill_b_cv.noises_b)

    plan.computation_call(
        mul_key, result_b, result_cv, messages, noises_b, noises_a, key)

    copy_noises_a = PureParallel.from_trf(transformations.copy(noises_a))
    plan.computation_call(copy_noises_a, result_a, noises_a)

    return plan
def _build_plan(
        self, plan_factory, device_params,
        ks_a, ks_b, ks_cv, in_key, out_key, noises_a, noises_b):
    """Build the plan that fills the keyswitch key arrays.

    First a ``MatrixMulVector`` over ``noises_a`` is applied to ``out_key``
    and stored in a temporary ``b_term``; then a ``PureParallel`` rendered
    from the ``make_lwe_keyswitch_key`` template (guided by ``ks_b``) writes
    ``ks_a``, ``ks_b`` and ``ks_cv``.
    """
    plan = plan_factory()

    # The values are unused; the unpacking requires ``ks_a`` to be 4-dimensional.
    _extracted_n, _t, _base, _inner_n = ks_a.shape

    mul_key = MatrixMulVector(noises_a)
    b_term = plan.temp_array_like(mul_key.parameter.output)

    parameters = [
        Parameter('ks_a', Annotation(ks_a, 'o')),
        Parameter('ks_b', Annotation(ks_b, 'o')),
        Parameter('ks_cv', Annotation(ks_cv, 'o')),
        Parameter('in_key', Annotation(in_key, 'i')),
        Parameter('b_term', Annotation(b_term, 'i')),
        Parameter('noises_a', Annotation(noises_a, 'i')),
        Parameter('noises_b', Annotation(noises_b, 'i')),
    ]
    snippet = Snippet(
        TEMPLATE.get_def("make_lwe_keyswitch_key"),
        render_kwds=dict(
            log2_base=self._log2_base,
            output_size=self._output_size,
            noise=self._noise))
    build_keyswitch = PureParallel(parameters, snippet, guiding_array="ks_b")

    plan.computation_call(mul_key, b_term, noises_a, out_key)
    plan.computation_call(
        build_keyswitch,
        ks_a, ks_b, ks_cv, in_key, b_term, noises_a, noises_b)

    return plan
def test_from_trf(thr, guiding_array):
    """
    Test the creation of ``PureParallel`` out of a transformation
    with various values of the guiding array.

    ``guiding_array`` is one of ``'input'``, ``'output'`` or ``'none'``;
    any other value raises ``ValueError``.
    """
    N = 1000
    coeff = 3
    dtype = numpy.float32

    arr_t = Type(dtype, shape=N)
    trf = mul_param(arr_t, dtype)

    if guiding_array == 'input':
        arr = trf.input
    elif guiding_array == 'output':
        arr = trf.output
    elif guiding_array == 'none':
        arr = None
    else:
        # Fail with a clear message instead of an unbound local further down.
        raise ValueError("unknown guiding_array value: " + repr(guiding_array))

    p = PureParallel.from_trf(trf, guiding_array=arr)

    # The new PureParallel has to preserve the parameter list of the original transformation.
    assert list(p.signature.parameters.values()) == list(trf.signature.parameters.values())

    a = get_test_array_like(p.parameter.input)
    a_dev = thr.to_device(a)
    res_dev = thr.empty_like(p.parameter.output)

    pc = p.compile(thr)
    pc(res_dev, a_dev, coeff)

    # Use the same coefficient that was passed to the kernel instead of
    # repeating its literal value.
    assert diff_is_negligible(res_dev.get(), a * coeff)
def setitem_computation(dest, source, is_array):
    """
    Build a computation that writes ``source`` into ``dest``.

    When ``is_array`` is false a ``broadcast_param`` transformation on
    ``dest`` is used. Otherwise a copy transformation is used, with a cast of
    ``source`` to ``dest.dtype`` attached to its input.
    The returned computation is not compiled here.
    """
    if not is_array:
        broadcast_trf = transformations.broadcast_param(dest)
        return PureParallel.from_trf(broadcast_trf, guiding_array=broadcast_trf.output)

    # Describe the source as it looks after the cast to ``dest.dtype``.
    casted_type = Type.from_value(source).with_dtype(dest.dtype)
    copy_trf = transformations.copy(casted_type, dest)
    computation = PureParallel.from_trf(copy_trf, guiding_array=copy_trf.output)

    cast_trf = transformations.cast(source, dest.dtype)
    computation.parameter.input.connect(
        cast_trf, cast_trf.output, src_input=cast_trf.input)
    return computation
def __init__(self, arr):
    """Create the nested copy computation and declare the outer signature.

    The nested computation is stored in ``self._copy_comp``; the outer
    computation takes ``outer_output`` and ``outer_input``, both described by
    ``arr``.
    """
    inner_trf = copy(arr, out_arr_t=arr)
    self._copy_comp = PureParallel.from_trf(inner_trf, inner_trf.input)

    outer_parameters = [
        Parameter('outer_output', Annotation(arr, 'o')),
        Parameter('outer_input', Annotation(arr, 'i')),
    ]
    Computation.__init__(self, outer_parameters)
def softmax(ctx, activations, bias, dest=None):
    """Softmax activation function, run with a cached kernel.

    One work item per row: it stores ``exp(clamp(x + bias, -45, 45))`` for
    every column in ``dest`` while accumulating the row sum, then divides the
    stored values by that sum. When ``dest`` is ``None`` the result overwrites
    ``activations``.

    The compiled kernel is memoised in ``ctx.kernel_cache`` under
    ``(softmax, activations.shape)``.
    """
    cache = ctx.kernel_cache
    thread = ctx.thread

    if dest is None:
        dest = activations

    key = (softmax, activations.shape)
    if key not in cache:
        logging.info("compiling " + str(key))

        # Regression hidden layer
        # NOTE(review): ``dest`` is annotated as output-only but the template
        # also loads from it — confirm the annotation is sufficient.
        parameters = [
            Parameter('activations', Annotation(activations, 'i')),
            Parameter('bias', Annotation(bias, 'i')),
            Parameter('dest', Annotation(dest, 'o')),
        ]
        code = """
        float x;
        float b;
        float s = 0.0f;
        SIZE_T tl = ${target_length};
        for(SIZE_T j=0; j < tl; j++) {
            x = ${activations.load_idx}(${idxs[0]}, j);
            b = ${bias.load_idx}(j);
            x += b;
            x = exp(min(max(x, -45.0f), 45.0f));
            ${dest.store_idx}(${idxs[0]}, j, x);

            s += x;
        }

        // divide by sum
        for(SIZE_T j=0; j < tl; j++) {
            x = ${dest.load_idx}(${idxs[0]}, j);
            x /= s;
            ${dest.store_idx}(${idxs[0]}, j, x);
        }
        """
        kernel_softmax = PureParallel(
            parameters,
            code,
            guiding_array=(activations.shape[0],),
            render_kwds={'target_length': numpy.int32(activations.shape[1])})
        cache[key] = kernel_softmax.compile(thread)

    cache[key](activations, bias, dest)
def __init__(self, arr):
    """Create the nested copy computation and declare the outer signature.

    The nested computation is stored in ``self._copy_comp``; the outer
    computation takes ``outer_output`` and ``outer_input``, both described by
    ``arr``.
    """
    inner_trf = copy(arr, out_arr_t=arr)
    self._copy_comp = PureParallel.from_trf(inner_trf, inner_trf.input)

    outer_parameters = [
        Parameter('outer_output', Annotation(arr, 'o')),
        Parameter('outer_input', Annotation(arr, 'i')),
    ]
    Computation.__init__(self, outer_parameters)
def softmax(ctx, activations, bias, dest=None):
    """Softmax activation function, run with a cached kernel.

    One work item per row: it stores ``exp(clamp(x + bias, -45, 45))`` for
    every column in ``dest`` while accumulating the row sum, then divides the
    stored values by that sum. When ``dest`` is ``None`` the result overwrites
    ``activations``.

    The compiled kernel is memoised in ``ctx.kernel_cache`` under
    ``(softmax, activations.shape)``.
    """
    cache = ctx.kernel_cache
    thread = ctx.thread

    if dest is None:
        dest = activations

    key = (softmax, activations.shape)
    if key not in cache:
        logging.info("compiling " + str(key))

        # Regression hidden layer
        # NOTE(review): ``dest`` is annotated as output-only but the template
        # also loads from it — confirm the annotation is sufficient.
        parameters = [
            Parameter('activations', Annotation(activations, 'i')),
            Parameter('bias', Annotation(bias, 'i')),
            Parameter('dest', Annotation(dest, 'o')),
        ]
        code = """
        float x;
        float b;
        float s = 0.0f;
        SIZE_T tl = ${target_length};
        for(SIZE_T j=0; j < tl; j++) {
            x = ${activations.load_idx}(${idxs[0]}, j);
            b = ${bias.load_idx}(j);
            x += b;
            x = exp(min(max(x, -45.0f), 45.0f));
            ${dest.store_idx}(${idxs[0]}, j, x);

            s += x;
        }

        // divide by sum
        for(SIZE_T j=0; j < tl; j++) {
            x = ${dest.load_idx}(${idxs[0]}, j);
            x /= s;
            ${dest.store_idx}(${idxs[0]}, j, x);
        }
        """
        kernel_softmax = PureParallel(
            parameters,
            code,
            guiding_array=(activations.shape[0],),
            render_kwds={'target_length': numpy.int32(activations.shape[1])})
        cache[key] = kernel_softmax.compile(thread)

    cache[key](activations, bias, dest)
def get_test_computation(arr_t):
    """Build a ``PureParallel`` that copies ``input`` to ``output``.

    Both parameters are described by ``arr_t``; the kernel addresses them with
    the full, explicitly joined index list.
    """
    parameters = [
        Parameter('output', Annotation(arr_t, 'o')),
        Parameter('input', Annotation(arr_t, 'i')),
    ]
    code = """
        <%
            all_idxs = ", ".join(idxs)
        %>
        ${output.store_idx}(${all_idxs}, ${input.load_idx}(${all_idxs}));
        """
    return PureParallel(parameters, code)
def _build_trivial_plan(self, plan_factory, output, input_):
    """Build a plan that only copies ``input_`` to ``output``."""
    # Trivial problem. Need to add a dummy kernel
    # because we still have to run transformations.
    plan = plan_factory()

    passthrough = copy(input_, out_arr_t=output)
    dummy_kernel = PureParallel.from_trf(passthrough, passthrough.input)
    plan.computation_call(dummy_kernel, output, input_)

    return plan
def _build_plan(self, plan_factory, device_params, array, shift):
    """Build the plan that rolls ``array`` along ``self._axis`` by ``shift``.

    The rolled data is written to a temporary array first and then copied
    back into ``array``.
    """
    plan = plan_factory()
    scratch = plan.temp_array_like(array)

    roll = roll_computation(array, self._axis)
    plan.computation_call(roll, scratch, array, shift)

    copy_back_trf = transformations.copy(scratch, out_arr_t=array)
    copy_back = PureParallel.from_trf(
        copy_back_trf, guiding_array=copy_back_trf.output)
    plan.computation_call(copy_back, array, scratch)

    return plan
def test_zero_length_shape(thr):
    """Check a ``PureParallel`` whose arrays and guiding shape are empty tuples.

    The kernel doubles the single input value; the result is compared with
    ``a * 2`` cast to ``float32``.
    """
    dtype = numpy.float32

    parameters = [
        Parameter('output', Annotation(Type(dtype, shape=tuple()), 'o')),
        Parameter('input', Annotation(Type(dtype, shape=tuple()), 'i')),
    ]
    code = """
        float t = ${input.load_idx}();
        ${output.store_idx}(t * 2);
        """
    computation = PureParallel(parameters, code, guiding_array=tuple())

    host_input = get_test_array_like(computation.parameter.input)
    device_input = thr.to_device(host_input)
    device_output = thr.empty_like(computation.parameter.output)

    compiled = computation.compile(thr)
    compiled(device_output, device_input)

    expected = (host_input * 2).astype(dtype)
    assert diff_is_negligible(device_output.get(), expected)
def test_zero_length_shape(thr):
    """Check a ``PureParallel`` whose arrays and guiding shape are empty tuples.

    The kernel doubles the single input value; the result is compared with
    ``a * 2`` cast to ``float32``.
    """
    dtype = numpy.float32

    parameters = [
        Parameter('output', Annotation(Type(dtype, shape=tuple()), 'o')),
        Parameter('input', Annotation(Type(dtype, shape=tuple()), 'i')),
    ]
    code = """
        float t = ${input.load_idx}();
        ${output.store_idx}(t * 2);
        """
    computation = PureParallel(parameters, code, guiding_array=tuple())

    host_input = get_test_array_like(computation.parameter.input)
    device_input = thr.to_device(host_input)
    device_output = thr.empty_like(computation.parameter.output)

    compiled = computation.compile(thr)
    compiled(device_output, device_input)

    expected = (host_input * 2).astype(dtype)
    assert diff_is_negligible(device_output.get(), expected)
def test_tlwe_transformed_add_mul_to_trf(thread):
    """Compare the TLWE transformed add-mul transformation with its reference.

    Uses the ``'NTT'`` transform type. Integer transformed dtypes must match
    exactly; other dtypes are compared with ``numpy.allclose``.
    """
    batch_shape = (2, 3)

    params = NuFHEParameters(transform_type='NTT')
    perf_params = PerformanceParameters(params).for_device(thread.device_params)

    tgsw_params = params.tgsw_params
    tlwe_params = tgsw_params.tlwe_params
    decomp_length = tgsw_params.decomp_length
    mask_size = tlwe_params.mask_size
    polynomial_degree = tlwe_params.polynomial_degree

    transform = get_transform(tlwe_params.transform_type)
    tlength = transform.transformed_length(polynomial_degree)
    tdtype = transform.transformed_dtype()

    bk_len = 10
    bk_row_idx = 2

    result_shape = batch_shape + (mask_size + 1, tlength)
    sample_shape = batch_shape + (mask_size + 1, decomp_length, tlength)
    bootstrap_key_shape = (
        bk_len, mask_size + 1, decomp_length, mask_size + 1, tlength)

    expected = numpy.empty(result_shape, tdtype)
    sample = get_test_array(sample_shape, 'ff_number')
    bootstrap_key = get_test_array(bootstrap_key_shape, 'ff_number')

    result_dev = thread.empty_like(expected)
    sample_dev = thread.to_device(sample)
    bootstrap_key_dev = thread.to_device(bootstrap_key)

    trf = get_tlwe_transformed_add_mul_to_trf(
        tgsw_params, batch_shape, bk_len, perf_params)
    device_comp = PureParallel.from_trf(trf, guiding_array='result').compile(thread)
    reference = tlwe_transformed_add_mul_to_trf_reference(
        tgsw_params, batch_shape, bk_len, perf_params)

    device_comp(result_dev, sample_dev, bootstrap_key_dev, bk_row_idx)
    actual = result_dev.get()

    # The reference fills ``expected`` in place.
    reference(expected, sample, bootstrap_key, bk_row_idx)

    if not numpy.issubdtype(tdtype, numpy.integer):
        assert numpy.allclose(expected, actual)
    else:
        assert (expected == actual).all()
def __init__(self, size, dtype):
    """Declare an ``output``/``input`` signature and build the nested adder.

    The nested ``PureParallel`` stored in ``self._p`` takes ``output``, ``i1``
    and ``i2`` and writes ``i1 + i2`` element-wise. Every parameter is an
    array of ``dtype`` with shape ``size``.
    """
    outer_parameters = [
        Parameter(name, Annotation(Type(dtype, shape=size), role))
        for name, role in (('output', 'o'), ('input', 'i'))
    ]
    Computation.__init__(self, outer_parameters)

    inner_parameters = [
        Parameter(name, Annotation(Type(dtype, shape=size), role))
        for name, role in (('output', 'o'), ('i1', 'i'), ('i2', 'i'))
    ]
    inner_code = """
        ${i1.ctype} t1 = ${i1.load_idx}(${idxs[0]});
        ${i2.ctype} t2 = ${i2.load_idx}(${idxs[0]});
        ${output.store_idx}(${idxs[0]}, t1 + t2);
        """
    self._p = PureParallel(inner_parameters, inner_code)
def Multiply(type):
    """Build a ``PureParallel`` that multiplies ``in1`` by ``in2`` element-wise.

    All three parameters are described by ``type``. For complex types the
    product is written out on the ``.x``/``.y`` components; the branch is
    selected by a preprocessor ``#if`` on the rendered ``complex`` flag.
    """
    parameters = [
        Parameter('output', Annotation(type, 'o')),
        Parameter('in1', Annotation(type, 'i')),
        Parameter('in2', Annotation(type, 'i')),
    ]

    # Preprocessor directives must stay on lines of their own.
    code = """
        ${ctype} f1 = ${in1.load_same}, f2 = ${in2.load_same};
        #if ${complex}
        ${output.store_same}((${ctype})(f1.x*f2.x - f1.y*f2.y, f1.x*f2.y + f1.y*f2.x));
        #else
        ${output.store_same}(f1*f2);
        #endif
        """

    render_kwds = dict(
        ctype=type.ctype,
        complex=int(dtypes.is_complex(type)))
    return PureParallel(parameters, code, render_kwds=render_kwds)
def test_array_offset(thr):
    """Check arrays created with a byte offset in three different ways.

    A single kernel writes each index value to position ``index - offset_len``
    of all three offset arrays. The offset arrays must then hold
    ``offset_len .. arr_len + offset_len - 1`` and the explicitly provided
    base arrays must hold the full range.
    """
    dtype = numpy.uint32
    itemsize = dtypes.normalize_type(dtype).itemsize
    offset_len = 10
    arr_len = 16

    byte_offset = itemsize * offset_len
    full_len = arr_len + offset_len

    # internal creation of the base array
    a1 = thr.array((arr_len,), dtype, offset=byte_offset)

    # providing base
    a2_base = thr.array((full_len,), dtype)
    a2 = thr.array((arr_len,), dtype, offset=byte_offset, base=a2_base)

    # providing base_data
    a3_base = thr.array((full_len,), dtype)
    a3_data = a3_base.base_data
    a3 = thr.array((arr_len,), dtype, offset=byte_offset, base_data=a3_data)

    parameters = [
        Parameter('output1', Annotation(a1, 'o')),
        Parameter('output2', Annotation(a2, 'o')),
        Parameter('output3', Annotation(a3, 'o')),
    ]
    code = """
        ${output1.store_idx}((int)${idxs[0]} - ${offset_len}, ${idxs[0]});
        ${output2.store_idx}((int)${idxs[0]} - ${offset_len}, ${idxs[0]});
        ${output3.store_idx}((int)${idxs[0]} - ${offset_len}, ${idxs[0]});
        """
    fill = PureParallel(
        parameters,
        code,
        render_kwds=dict(offset_len=offset_len),
        guiding_array=(full_len,)
    ).compile(thr)

    fill(a1, a2, a3)

    offset_range = numpy.arange(offset_len, full_len).astype(dtype)
    full_range = numpy.arange(full_len).astype(dtype)

    assert diff_is_negligible(a1.get(), offset_range)
    assert diff_is_negligible(a2_base.get(), full_range)
    assert diff_is_negligible(a2.get(), offset_range)
    assert diff_is_negligible(a3_base.get(), full_range)
    assert diff_is_negligible(a3.get(), offset_range)
def test_array_views(thr):
    """Check copying between two sliced views of device arrays.

    The view ``[2:4, ::2, ::-1]`` of the source is copied into the view
    ``[4:, 1:5, :]`` of the destination and compared with the same slice of
    the host array.
    """
    host = get_test_array((6, 8, 10), numpy.int32)

    src_dev = thr.to_device(host)
    dst_dev = thr.empty_like(host)

    src_view = src_dev[2:4, ::2, ::-1]
    dst_view = dst_dev[4:, 1:5, :]

    copy_trf = transformations.copy(src_view, out_arr_t=dst_view)
    move = PureParallel.from_trf(copy_trf, guiding_array='output').compile(thr)

    move(dst_view, src_view)

    actual = dst_dev.get()[4:, 1:5, :]
    expected = host[2:4, ::2, ::-1]

    assert diff_is_negligible(actual, expected)
def _build_plan(
        self, plan_factory, device_params,
        ks_a, ks_b, ks_cv, in_key, out_key, noises_a, noises_b):
    """Assemble the plan that fills the keyswitch key arrays.

    The plan consists of three computation calls, in this order:

    1. a sum reduction of ``noises_b`` whose output is divided by the number
       of elements of ``noises_b`` (stored in a temporary ``noises_b_mean``);
    2. ``MatrixMulVector(noises_a)`` called with ``noises_a`` and
       ``out_key``, writing a temporary ``b_term``;
    3. a ``PureParallel`` kernel rendered from the
       ``make_lwe_keyswitch_key`` template definition, writing ``ks_a``,
       ``ks_b`` and ``ks_cv``.

    ``device_params`` is accepted for interface compatibility and not used.
    Returns the assembled plan.
    """
    plan = plan_factory()

    # Not used below, but the unpacking requires ``ks_a`` to have exactly
    # four dimensions, so it is kept.
    extracted_n, t, base, inner_n = ks_a.shape

    # Sum reduction with a division attached to its output -> mean.
    mean = Reduce(noises_b, predicate_sum(noises_b.dtype))
    element_count = numpy.prod(noises_b.shape)
    norm = transformations.div_const(mean.parameter.output, element_count)
    mean.parameter.output.connect(norm, norm.input, mean=norm.output)
    noises_b_mean = plan.temp_array_like(mean.parameter.mean)

    mul_key = MatrixMulVector(noises_a)
    b_term = plan.temp_array_like(mul_key.parameter.output)

    # (name, array the annotation is taken from, role)
    signature = (
        ('ks_a', ks_a, 'o'),
        ('ks_b', ks_b, 'o'),
        ('ks_cv', ks_cv, 'o'),
        ('in_key', in_key, 'i'),
        ('b_term', b_term, 'i'),
        ('noises_a', noises_a, 'i'),
        ('noises_b', noises_b, 'i'),
        ('noises_b_mean', noises_b_mean, 'i'),
    )
    parameters = [
        Parameter(name, Annotation(array, role))
        for name, array, role in signature
    ]

    kernel_body = Snippet(
        TEMPLATE.get_def("make_lwe_keyswitch_key"),
        render_kwds=dict(
            log2_base=self._log2_base,
            output_size=self._output_size,
            double_to_t32=double_to_t32_module,
            noise=self._noise))

    build_keyswitch = PureParallel(
        parameters, kernel_body, guiding_array="ks_b")

    plan.computation_call(mean, noises_b_mean, noises_b)
    plan.computation_call(mul_key, b_term, noises_a, out_key)
    plan.computation_call(
        build_keyswitch,
        ks_a, ks_b, ks_cv, in_key, b_term, noises_a, noises_b, noises_b_mean)

    return plan
def _build_plan(self, plan_factory, device_params, result, phase):
    """Assemble the plan that maps each ``phase`` element to a ``result`` value.

    With ``interv = 2**32 // self._mspace_size`` and
    ``half_interv = interv // 2``, every element is computed as
    ``((unsigned int)phase + half_interv) / interv`` and stored in the
    corresponding position of ``result``.

    ``device_params`` is accepted for interface compatibility and not used.
    Returns the assembled plan.
    """
    plan = plan_factory()

    # The ``<% %>`` block contains Python code, so each statement needs its
    # own line; the template's line structure is significant here.
    # ``uint64`` is passed to the renderer but not referenced by the template
    # as written; it is kept so the render keywords stay unchanged.
    tr = Transformation(
        [
            Parameter('result', Annotation(result, 'o')),
            Parameter('phase', Annotation(phase, 'i')),
        ],
        """
        <%
            interv = 2**32 // mspace_size
            half_interv = interv // 2
        %>
        ${phase.ctype} phase = ${phase.load_same};
        ${result.store_same}(((unsigned int)phase + ${half_interv}) / ${interv});
        """,
        render_kwds=dict(
            mspace_size=self._mspace_size,
            uint64=dtypes.ctype(numpy.uint64)),
        connectors=['result', 'phase'])

    plan.computation_call(
        PureParallel.from_trf(tr, guiding_array='result'),
        result, phase)

    return plan
def get_procs(thr, N):
    """Build and compile the four length-``N`` computations used by the caller.

    Returns a tuple ``(fft_unimod, mag_square, apply_mask, combine_mag_phi)``:

    * ``fft_unimod`` -- the FFT from ``FFTFactory`` with an output
      transformation attached: elements with index greater than ``N/2`` are
      stored as zero, all others as ``polar_unit`` of their phase angle.
    * ``mag_square`` -- stores ``x*x + y*y`` of each input element in the
      real part of the output and zero in the imaginary part.
    * ``apply_mask`` -- stores the product of a complex ``origin`` element
      and a real ``mask`` element.
    * ``combine_mag_phi`` -- takes the real part of ``mag_square``, replaces
      negative values by zero and otherwise raises it to the power 0.5, and
      combines it through ``polar`` with the phase angle of ``phase``.
    """
    def complex_vector():
        # A fresh type object per annotation, as in a literal spelling.
        return Type(np.complex128, N)

    def real_vector():
        return Type(np.double, N)

    # --- FFT with the unit-modulus output transformation -------------------
    fft = FFTFactory.create(thr, (N,), compile_=False)

    unimod_code = """
        VSIZE_T idx = ${idxs[0]};
        ${input.ctype} val = ${input.load_same};
        if (idx>${N}/2){
            val.x = 0.0;
            val.y = 0.0;
            ${output.store_same}(val);
        }else
            ${output.store_same}(${polar_unit}(atan2(val.y, val.x)));
        """
    unimod_trans = Transformation(
        [
            Parameter('output', Annotation(complex_vector(), 'o')),
            Parameter('input', Annotation(complex_vector(), 'i')),
        ],
        unimod_code,
        render_kwds=dict(
            polar_unit=functions.polar_unit(dtype=np.float64),
            N=N))
    fft.parameter.output.connect(
        unimod_trans, unimod_trans.input, uni=unimod_trans.output)
    fft_unimod = fft.compile(thr)

    # --- squared magnitude ---------------------------------------------------
    mag_square_code = '''
        VSIZE_T idx = ${idxs[0]};
        ${input.ctype} val = ${input.load_idx}(idx);
        val.x = val.x*val.x + val.y*val.y;
        val.y = 0;
        ${output.store_idx}(idx, val);
        '''
    mag_square = PureParallel(
        [
            Parameter('output', Annotation(complex_vector(), 'o')),
            Parameter('input', Annotation(complex_vector(), 'i')),
        ],
        mag_square_code).compile(thr)

    # --- complex * real mask -------------------------------------------------
    apply_mask_code = '''
        VSIZE_T idx = ${idxs[0]};
        ${output.store_idx}(idx, ${mul}(${origin.load_idx}(idx), ${mask.load_idx}(idx)));
        '''
    apply_mask = PureParallel(
        [
            Parameter('output', Annotation(complex_vector(), 'o')),
            Parameter('origin', Annotation(complex_vector(), 'i')),
            Parameter('mask', Annotation(real_vector(), 'i')),
        ],
        apply_mask_code,
        render_kwds=dict(mul=functions.mul(np.complex128, np.double))
    ).compile(thr)

    # --- magnitude from one array, phase from another -----------------------
    combine_code = '''
        VSIZE_T idx = ${idxs[0]};
        double r = ${mag_square.load_idx}(idx).x;
        r = r<0.0 ? 0.0 : ${pow}(r, 0.5);
        double2 v = ${phase.load_idx}(idx);
        double angle = atan2(v.y, v.x);
        ${output.store_idx}(idx, ${polar}(r, angle));
        '''
    combine_mag_phi = PureParallel(
        [
            Parameter('output', Annotation(complex_vector(), 'o')),
            Parameter('mag_square', Annotation(complex_vector(), 'i')),
            Parameter('phase', Annotation(complex_vector(), 'i')),
        ],
        combine_code,
        render_kwds=dict(
            pow=functions.pow(np.double),
            polar=functions.polar(np.double))
    ).compile(thr)

    return fft_unimod, mag_square, apply_mask, combine_mag_phi
def convolve2d_gradient(ctx, prev_deltas, deltas, gradient_intermediate):
    """Fill ``gradient_intermediate`` with per-position weight-gradient terms.

    For every index ``(number, dx, dy, channel, filter, fx, fy)`` of
    ``gradient_intermediate`` the kernel stores::

        deltas[number, filter, dx, dy]
            * prev_deltas[number, channel, dx + fx, dy + fy]

    The terms are not summed here; according to the kernel's own comment the
    summation over positions happens in a separate kernel.

    The compiled kernel is cached in ``ctx.kernel_cache`` under a key made of
    this function, the two input shapes and ``ctx.thread``.  Shapes are
    validated only when the kernel is compiled, i.e. on a cache miss.

    Returns ``gradient_intermediate``.
    Raises ``AssertionError`` on a cache miss if the array shapes disagree.
    """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread
    key = (convolve2d_gradient, prev_deltas.shape, deltas.shape, thread)

    if key not in kernel_cache:
        logging.info("compiling %s", key)

        # Extract shapes from the arrays.  Each array's batch size gets its
        # own name so that all three can be compared; reusing ``n`` for the
        # output would silently drop the check on ``prev_deltas``.
        n, channels, p_width, p_height = prev_deltas.shape
        n_1, filters, d_width, d_height = deltas.shape
        (n_2, d_width_1, d_height_1, channels_1, filters_1,
         f_width, f_height) = gradient_intermediate.shape

        # Some assertions to be sure everything is correct
        assert n == n_1 == n_2
        assert filters_1 == filters
        assert channels_1 == channels
        expected_shape = get_output_shape(prev_deltas, deltas, 'gradient')
        assert expected_shape == gradient_intermediate.shape
        assert d_width_1 == d_width
        assert d_height_1 == d_height

        # Render keywords
        render_kwds = {
            'n': n,
            'filters': filters,
            'channels': channels,
            'f_width': f_width,
            'f_height': f_height,
            'd_width': d_width,
            'd_height': d_height,
            'p_width': p_width,
            'p_height': p_height,
        }

        # The kernel
        kernel = PureParallel(
            [
                Parameter('prev_deltas', Annotation(prev_deltas, 'i')),
                Parameter('deltas', Annotation(deltas, 'i')),
                Parameter('gradient_intermediate',
                          Annotation(gradient_intermediate, 'o')),
            ],
            """
            const SIZE_T number = ${idxs[0]};
            const SIZE_T dx = ${idxs[1]};
            const SIZE_T dy = ${idxs[2]};
            const SIZE_T channel = ${idxs[3]};
            const SIZE_T filter = ${idxs[4]};
            const SIZE_T fx = ${idxs[5]};
            const SIZE_T fy = ${idxs[6]};

            // weight gradient at the weight position fx, fy is defined by the sum
            //
            // (deltas * prev_deltas[fx:d_width+fx, fy:fy+d_height]).sum()
            //
            // alternatively we can store all delta positions and sum in a separate kernel - this is what we do now.
            float g = ${deltas.load_idx}(number, filter, dx, dy) * ${prev_deltas.load_idx}(number, channel, dx+fx, dy+fy);
            ${gradient_intermediate.store_same}(g);
            """,
            guiding_array='gradient_intermediate',
            render_kwds=render_kwds)

        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # run convolution -> intermediate
    kernel_cache[key](prev_deltas, deltas, gradient_intermediate)

    return gradient_intermediate
def convolve2d_gradient(ctx, prev_deltas, deltas, gradient_intermediate):
    """Compute the unsummed weight-gradient terms into ``gradient_intermediate``.

    ``gradient_intermediate`` is indexed as
    ``(number, dx, dy, channel, filter, fx, fy)``; each element receives::

        deltas[number, filter, dx, dy]
            * prev_deltas[number, channel, dx + fx, dy + fy]

    No summation is done in this kernel; its inline comment states that the
    sum over positions is taken by a separate kernel.

    Compiled kernels are memoised in ``ctx.kernel_cache``, keyed on this
    function, ``prev_deltas.shape``, ``deltas.shape`` and ``ctx.thread``.
    Shape validation runs only when a kernel has to be compiled.

    Returns ``gradient_intermediate``.
    Raises ``AssertionError`` on a cache miss if the array shapes disagree.
    """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread
    key = (convolve2d_gradient, prev_deltas.shape, deltas.shape, thread)

    if key not in kernel_cache:
        logging.info("compiling %s", key)

        # Extract shapes from the arrays.  The batch sizes are unpacked into
        # three distinct names: unpacking the output's batch size into ``n``
        # again would overwrite the value read from ``prev_deltas`` before
        # it is ever compared.
        n, channels, p_width, p_height = prev_deltas.shape
        n_1, filters, d_width, d_height = deltas.shape
        (n_2, d_width_1, d_height_1, channels_1, filters_1,
         f_width, f_height) = gradient_intermediate.shape

        # Some assertions to be sure everything is correct
        assert n == n_1 == n_2
        assert filters_1 == filters
        assert channels_1 == channels
        expected_shape = get_output_shape(prev_deltas, deltas, 'gradient')
        assert expected_shape == gradient_intermediate.shape
        assert d_width_1 == d_width
        assert d_height_1 == d_height

        # Render keywords
        render_kwds = {
            'n': n,
            'filters': filters,
            'channels': channels,
            'f_width': f_width,
            'f_height': f_height,
            'd_width': d_width,
            'd_height': d_height,
            'p_width': p_width,
            'p_height': p_height,
        }

        # The kernel
        kernel = PureParallel(
            [
                Parameter('prev_deltas', Annotation(prev_deltas, 'i')),
                Parameter('deltas', Annotation(deltas, 'i')),
                Parameter('gradient_intermediate',
                          Annotation(gradient_intermediate, 'o')),
            ],
            """
            const SIZE_T number = ${idxs[0]};
            const SIZE_T dx = ${idxs[1]};
            const SIZE_T dy = ${idxs[2]};
            const SIZE_T channel = ${idxs[3]};
            const SIZE_T filter = ${idxs[4]};
            const SIZE_T fx = ${idxs[5]};
            const SIZE_T fy = ${idxs[6]};

            // weight gradient at the weight position fx, fy is defined by the sum
            //
            // (deltas * prev_deltas[fx:d_width+fx, fy:fy+d_height]).sum()
            //
            // alternatively we can store all delta positions and sum in a separate kernel - this is what we do now.
            float g = ${deltas.load_idx}(number, filter, dx, dy) * ${prev_deltas.load_idx}(number, channel, dx+fx, dy+fy);
            ${gradient_intermediate.store_same}(g);
            """,
            guiding_array='gradient_intermediate',
            render_kwds=render_kwds)

        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # run convolution -> intermediate
    kernel_cache[key](prev_deltas, deltas, gradient_intermediate)

    return gradient_intermediate
def convolve2d_backprop(ctx, deltas, weights, deltas_intermediate):
    """Fill ``deltas_intermediate`` with per-(channel, filter) back-propagated deltas.

    For every index ``(number, channel, filter, xout, yout)`` of
    ``deltas_intermediate`` the kernel accumulates, over all filter offsets
    ``(i, j)`` for which ``(xout - i, yout - j)`` lies inside the delta map::

        deltas[number, filter, xout - i, yout - j]
            * weights[channel, filter, f_width - 1 - i, f_height - 1 - j]

    The compiled kernel is cached in ``ctx.kernel_cache`` under a key made of
    this function, ``deltas.shape``, ``weights.shape`` and ``ctx.thread``.
    Shapes are validated only when the kernel is compiled, i.e. on a cache
    miss.

    Returns ``deltas_intermediate``.
    Raises ``AssertionError`` on a cache miss if the array shapes disagree.
    """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread
    key = (convolve2d_backprop, deltas.shape, weights.shape, thread)

    if key not in kernel_cache:
        logging.info("compiling %s", key)

        # Extract shapes from the arrays
        channels, filters, f_width, f_height = weights.shape
        n_1, filters_1, d_width, d_height = deltas.shape
        n, channels_1, filters_2, p_width, p_height = deltas_intermediate.shape

        # Some assertions to be sure everything is correct
        assert n_1 == n
        assert filters_2 == filters_1 == filters
        assert channels_1 == channels
        expected_shape = get_output_shape(deltas, weights, 'backprop')
        assert expected_shape == deltas_intermediate.shape

        # Render keywords
        render_kwds = {
            'n': n,
            'filters': filters,
            'channels': channels,
            'f_width': f_width,
            'f_height': f_height,
            'd_width': d_width,
            'd_height': d_height,
            'p_width': p_width,
            'p_height': p_height,
        }

        # The kernel.
        # ``deltas`` has shape (n, filters, d_width, d_height), so its second
        # axis must be indexed with ``filter``; indexing it with ``channel``
        # reads the wrong feature map (and runs out of bounds whenever
        # channels > filters).
        # NOTE(review): if SIZE_T is unsigned on the target backend, the
        # ``< 0`` tests never fire and negative offsets are only rejected by
        # the upper-bound tests after wrap-around -- confirm.
        kernel = PureParallel(
            [
                Parameter('deltas', Annotation(deltas, 'i')),
                Parameter('weights', Annotation(weights, 'i')),
                Parameter('deltas_intermediate',
                          Annotation(deltas_intermediate, 'o')),
            ],
            """
            float d = 0.0f;
            SIZE_T x, y, i, j, fi, fj;
            const SIZE_T number = ${idxs[0]};
            const SIZE_T channel = ${idxs[1]};
            const SIZE_T filter = ${idxs[2]};
            const SIZE_T xout = ${idxs[3]};
            const SIZE_T yout = ${idxs[4]};
            for (i=0; i < ${f_width}; i++){
                for (j=0; j < ${f_height}; j++){
                    x = xout - i;
                    if (x < 0) continue;
                    if (x >= ${d_width}) continue;
                    y = yout - j;
                    if (y < 0) continue;
                    if (y >= ${d_height}) continue;
                    // acces weights in flipped order!
                    fi = ${f_width} - i - 1;
                    fj = ${f_height} - j - 1;
                    d += ${deltas.load_idx}(number, filter, x, y) * ${weights.load_idx}(channel, filter, fi, fj);
                }
            }
            ${deltas_intermediate.store_same}(d);
            """,
            guiding_array='deltas_intermediate',
            render_kwds=render_kwds)

        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # run convolution -> intermediate
    kernel_cache[key](deltas, weights, deltas_intermediate)

    return deltas_intermediate