def _fp_convert(src_data, src_type, dest_tensor, reduce_shape):
    """
    Copy raw fp32 scratch data into dest_tensor, converting it to the
    destination dtype. If reduce_shape (rows, cols) is given, the rows are
    reduced by a dedicated kernel while converting.
    """
    if reduce_shape:
        # reduction path: one block of 32 threads per 32 output elements
        kernel = _get_reduce_kernel(dest_tensor.dtype.str[1:])
        blocks = _ceil_div(reduce_shape[1], 32)
        kernel.prepared_async_call((blocks, 1, 1), (32, 1, 1),
                                   dest_tensor.backend.stream,
                                   dest_tensor.gpudata,
                                   src_data,
                                   reduce_shape[1],
                                   reduce_shape[0] * reduce_shape[1])
    else:
        from neon.backends.nervanagpu import GPUTensor
        from neon.backends.float_ew import _get_compound_kernel, _get_fast_ew_dims

        # quick wrapper to convert raw fp32 scratch data to a destination tensor
        shape, strides = _get_fast_ew_dims(dest_tensor.size)

        kernel_args = [0,
                       dest_tensor.gpudata, strides[0], strides[1],
                       src_data, strides[0], strides[1],
                       shape[1]]

        # build a compound elementwise kernel that assigns the fp32 source
        # into the destination tensor's dtype
        kernel = _get_compound_kernel((
            (GPUTensor, 0, dest_tensor.dtype.str[1:], 0, False),
            (GPUTensor, 1, src_type, 0, False),
            ('assign', 0, False, 32)),
            dest_tensor.backend.compute_capability)
        kernel.prepared_async_call((shape[0], 1, 1), (32, 1, 1),
                                   dest_tensor.backend.stream,
                                   *kernel_args)
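# Illustrative call sketch (assumed pattern, not taken from neon call sites):
# after a kernel accumulates its output in an fp32 scratch allocation, the
# result can be cast into a lower-precision tensor, e.g.
#
#     _fp_convert(scratch_fp32, "f4", out_fp16, None)
#
# where scratch_fp32 is a hypothetical device allocation holding out_fp16.size
# fp32 values, out_fp16 is a GPUTensor with dtype float16, and passing None for
# reduce_shape selects the plain elementwise conversion path.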