Example #1
def _fp_convert(src_data, src_type, dest_tensor, reduce_shape):
    """
    Copy raw fp32 scratch data into dest_tensor, converting to the
    destination dtype along the way. If reduce_shape is given, the
    scratch is treated as a (rows, cols) block and summed across rows
    as part of the conversion.
    """
    if reduce_shape:

        # Sum the (rows, cols) fp32 scratch over its rows and write the
        # converted result to dest_tensor: one thread per output column,
        # launched in blocks of 32 threads.
        kernel = _get_reduce_kernel(dest_tensor.dtype.str[1:])
        blocks = _ceil_div(reduce_shape[1], 32)
        kernel.prepared_async_call((blocks, 1, 1), (32, 1, 1),
                                   dest_tensor.backend.stream,
                                   dest_tensor.gpudata,
                                   src_data,
                                   reduce_shape[1],
                                   reduce_shape[0] * reduce_shape[1])

    else:
        from neon.backends.nervanagpu import GPUTensor
        from neon.backends.float_ew import _get_compound_kernel, _get_fast_ew_dims

        # Quick wrapper to convert raw fp32 scratch data to a destination
        # tensor without any reduction, via a compound elementwise kernel.
        shape, strides = _get_fast_ew_dims(dest_tensor.size)
        kernel_args = [0,
                       dest_tensor.gpudata, strides[0], strides[1],
                       src_data, strides[0], strides[1],
                       shape[1]]

        # Compile (or fetch the memoized) "assign" kernel: operand 0 is
        # the output in the destination dtype, operand 1 the raw source
        # in src_type.
        kernel = _get_compound_kernel((
            (GPUTensor, 0, dest_tensor.dtype.str[1:], 0, False),
            (GPUTensor, 1, src_type, 0, False),
            ('assign', 0, False, 32)),
            dest_tensor.backend.compute_capability)
        kernel.prepared_async_call((shape[0], 1, 1),
                                   (32, 1, 1),
                                   dest_tensor.backend.stream,
                                   *kernel_args)
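
For reference, the host-side semantics of both branches can be sketched in
numpy. This is a hedged reconstruction, not neon code: it assumes the reduce
kernel sums over the leading axis of the scratch buffer, and the name
fp_convert_reference is illustrative.

import numpy as np

def fp_convert_reference(src_fp32, dest_dtype, reduce_shape=None):
    # CPU reference for _fp_convert: with reduce_shape=(rows, cols) the
    # fp32 scratch is viewed as rows partial results of length cols that
    # are summed before conversion; otherwise the data is converted
    # elementwise.
    if reduce_shape:
        src_fp32 = src_fp32.reshape(reduce_shape).sum(axis=0)
    return src_fp32.astype(dest_dtype)

# e.g. three fp32 partial results of length 4 collapse into one fp16 row
partials = np.ones((3, 4), dtype=np.float32)
out = fp_convert_reference(partials.ravel(), np.float16, reduce_shape=(3, 4))
assert out.dtype == np.float16 and (out == 3.0).all()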
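
A minimal usage sketch for the no-reduction path, assuming a CUDA device and
neon's GPU backend are available. The module path for _fp_convert and the
tensor shapes are assumptions for illustration; only the call signature comes
from the code above.

import numpy as np
from neon.backends.nervanagpu import NervanaGPU
from neon.backends.convolution import _fp_convert  # assumed location

ng = NervanaGPU()
dest = ng.empty((64, 64), dtype=np.float16)     # destination tensor
scratch = ng.zeros((64, 64), dtype=np.float32)  # raw fp32 device data

# reduce_shape=None takes the compound elementwise-assign path;
# 'f4' is the numpy dtype.str[1:] spelling of float32
_fp_convert(scratch.gpudata, 'f4', dest, None)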