    def test_no_sync_supplied_stream(self):
        # There should not be a synchronization when a stream is supplied for
        # the setitem call, whether it is the default stream, the legacy default
        # stream, the per-thread default stream, or another stream.
        streams = (cuda.stream(), cuda.default_stream(),
                   cuda.legacy_default_stream(),
                   cuda.per_thread_default_stream())

        for stream in streams:
            darr = cuda.to_device(np.arange(5))

            with patch.object(cuda.cudadrv.driver.Stream,
                              'synchronize',
                              return_value=None) as mock_sync:
                darr.setitem(0, 10, stream=stream)

            mock_sync.assert_not_called()

    def test_no_sync_default_stream(self):
        # There should not be a synchronization when the array has a default
        # stream, whether it is the default stream, the legacy default stream,
        # the per-thread default stream, or another stream.
        streams = (cuda.stream(), cuda.default_stream(),
                   cuda.legacy_default_stream(),
                   cuda.per_thread_default_stream())

        for stream in streams:
            darr = cuda.to_device(np.arange(5), stream=stream)

            with patch.object(cuda.cudadrv.driver.Stream,
                              'synchronize',
                              return_value=None) as mock_sync:
                darr[0] = 10

            mock_sync.assert_not_called()
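
    def test_sync_without_supplied_stream(self):
        # Hypothetical companion sketch, not part of the original suite: by the
        # reasoning in the comments above, when no stream is supplied and the
        # array has no default stream, the setitem path is expected to
        # synchronize. The method name and the expected call are assumptions.
        darr = cuda.to_device(np.arange(5))

        with patch.object(cuda.cudadrv.driver.Stream,
                          'synchronize',
                          return_value=None) as mock_sync:
            darr[0] = 10

        mock_sync.assert_called()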
Example #3
def child_test():
    from numba import cuda, int32, void
    from numba.core import config
    import io
    import logging
    import numpy as np
    import threading

    # Enable PTDS before we make any CUDA driver calls.  Enabling it first
    # ensures that PTDS APIs are used because the CUDA driver looks up API
    # functions on first use and memoizes them.
    config.CUDA_PER_THREAD_DEFAULT_STREAM = 1

    # Set up log capture for the Driver API so we can see what API calls were
    # used.
    logbuf = io.StringIO()
    handler = logging.StreamHandler(logbuf)
    cudadrv_logger = logging.getLogger('numba.cuda.cudadrv.driver')
    cudadrv_logger.addHandler(handler)
    cudadrv_logger.setLevel(logging.DEBUG)

    # Set up data for our test, and copy over to the device
    N = 2 ** 16
    N_THREADS = 10
    N_ADDITIONS = 4096

    # Seed the RNG for repeatability
    np.random.seed(1)
    x = np.random.randint(low=0, high=1000, size=N, dtype=np.int32)
    r = np.zeros_like(x)

    # One input and output array for each thread
    xs = [cuda.to_device(x) for _ in range(N_THREADS)]
    rs = [cuda.to_device(r) for _ in range(N_THREADS)]

    # Compute the grid size and get the [per-thread] default stream
    n_threads = 256
    n_blocks = N // n_threads
    stream = cuda.default_stream()

    # A simple multiplication-by-addition kernel. What it does exactly is not
    # too important; only that we have a kernel that does something.
    @cuda.jit(void(int32[::1], int32[::1]))
    def f(r, x):
        i = cuda.grid(1)

        if i >= len(r):
            return

        # Accumulate x into r
        for j in range(N_ADDITIONS):
            r[i] += x[i]

    # This function will be used to launch the kernel from each thread on its
    # own unique data.
    def kernel_thread(n):
        f[n_blocks, n_threads, stream](rs[n], xs[n])

    # Create threads
    threads = [threading.Thread(target=kernel_thread, args=(i,))
               for i in range(N_THREADS)]

    # Start all threads
    for thread in threads:
        thread.start()

    # Wait for all threads to finish, to ensure that we don't synchronize with
    # the device until all kernels are scheduled.
    for thread in threads:
        thread.join()

    # Synchronize with the device
    cuda.synchronize()

    # Check output is as expected
    expected = x * N_ADDITIONS
    for i in range(N_THREADS):
        np.testing.assert_equal(rs[i].copy_to_host(), expected)

    # Return the driver log output to the calling process for checking
    handler.flush()
    return logbuf.getvalue()
Example #4
    assert hb == hb2
    # out-of-band
    if pickle.HIGHEST_PROTOCOL >= 5:
        db = rmm.DeviceBuffer.to_device(hb)
        buffers = []
        pb2 = pickle.dumps(db, protocol=5, buffer_callback=buffers.append)
        del db
        assert len(buffers) == 1
        assert isinstance(buffers[0], pickle.PickleBuffer)
        assert bytes(buffers[0]) == hb
        db3 = pickle.loads(pb2, buffers=buffers)
        hb3 = db3.tobytes()
        assert hb3 == hb


@pytest.mark.parametrize("stream", [cuda.default_stream(), cuda.stream()])
def test_rmm_pool_numba_stream(stream):
    rmm.reinitialize(pool_allocator=True)

    stream = rmm._cuda.stream.Stream(stream)
    a = rmm._lib.device_buffer.DeviceBuffer(size=3, stream=stream)

    assert a.size == 3
    assert a.ptr != 0


def test_rmm_cupy_allocator():
    cupy = pytest.importorskip("cupy")

    m = rmm.rmm_cupy_allocator(42)
    assert m.mem.size == 42
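
# A minimal usage sketch, not part of the test above: rmm.rmm_cupy_allocator can be
# registered as CuPy's current allocator so that CuPy allocations are served from RMM.
# This follows the documented RMM/CuPy integration; the array below is illustrative only.
import cupy
import rmm

cupy.cuda.set_allocator(rmm.rmm_cupy_allocator)
arr = cupy.arange(10)  # device memory for this array is now obtained through RMM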
Example #5
def rnnt_loss_gpu(
    acts: torch.Tensor,
    labels: torch.Tensor,
    input_lengths: torch.Tensor,
    label_lengths: torch.Tensor,
    costs: torch.Tensor,
    grads: torch.Tensor,
    blank_label: int,
    fastemit_lambda: float,
    clamp: float,
    num_threads: int,
):
    """
    Wrapper method for accessing GPU RNNT loss.

    CUDA implementation ported from [HawkAaron/warp-transducer](https://github.com/HawkAaron/warp-transducer).

    Args:
        acts: Activation tensor of shape [B, T, U, V+1].
        labels: Ground truth labels of shape [B, U].
        input_lengths: Lengths of the acoustic sequence as a vector of ints [B].
        label_lengths: Lengths of the target sequence as a vector of ints [B].
        costs: Zero vector of length [B] in which costs will be set.
        grads: Zero tensor of shape [B, T, U, V+1] where the gradient will be set.
        blank_label: Index of the blank token in the vocabulary.
        fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to
            FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization.
        clamp: Float value. When set to value >= 0.0, will clamp the gradient to [-clamp, clamp].
        num_threads: Number of threads for OpenMP.
    """
    minibatch_size = acts.shape[0]
    maxT = acts.shape[1]
    maxU = acts.shape[2]
    alphabet_size = acts.shape[3]

    if hasattr(cuda, 'external_stream'):
        stream = cuda.external_stream(
            torch.cuda.current_stream(acts.device).cuda_stream)
    else:
        stream = cuda.default_stream()

    if num_threads < 0:
        num_threads = multiprocessing.cpu_count()

    num_threads = max(1, num_threads)  # have to use at least 1 thread

    gpu_size, status = rnnt_helper.get_workspace_size(maxT,
                                                      maxU,
                                                      minibatch_size,
                                                      gpu=True)
    if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS:
        raise RuntimeError(
            "Invalid parameter passed when calculating working space memory")

    # Select GPU index
    cuda.select_device(acts.device.index)
    gpu_workspace = torch.zeros(gpu_size,
                                device=acts.device,
                                dtype=acts.dtype,
                                requires_grad=False)

    ### VIEW TENSORS AS VECTORS FOR POINTER INDEXING ###
    acts, acts_shape = rnnt_helper.flatten_tensor(acts)

    wrapper = gpu_rnnt.GPURNNT(
        minibatch=minibatch_size,
        maxT=maxT,
        maxU=maxU,
        alphabet_size=alphabet_size,
        workspace=gpu_workspace,
        blank=blank_label,
        fastemit_lambda=fastemit_lambda,
        clamp=clamp,
        num_threads=num_threads,
        stream=stream,
    )

    if grads is None:
        status = wrapper.score_forward(
            acts=acts.data,
            costs=costs.data,
            pad_labels=labels.data,
            label_lengths=label_lengths.data,
            input_lengths=input_lengths.data,
        )

        if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS:
            raise RuntimeError("Could not calculate forward scores")

    else:
        ### FLATTEN GRAD TENSOR ###
        grads, grads_shape = rnnt_helper.flatten_tensor(grads)

        status = wrapper.cost_and_grad(
            acts=acts.data,
            grads=grads.data,
            costs=costs.data,
            pad_labels=labels.data,
            label_lengths=label_lengths.data,
            input_lengths=input_lengths.data,
        )

        if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS:
            raise RuntimeError("Could not calculate forward scores and gradients")

    del gpu_workspace, wrapper
    return True
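
# A minimal calling sketch for the wrapper above, assuming a CUDA device is available.
# Shapes follow the docstring ([B, T, U, V+1] activations, U = max target length + 1);
# all sizes and values here are illustrative and not taken from the original module.
def _example_rnnt_loss_gpu_call():
    B, T, U, V = 2, 8, 4, 5
    device = torch.device('cuda')
    acts = torch.randn(B, T, U, V + 1, device=device, dtype=torch.float32)
    labels = torch.randint(1, V + 1, (B, U - 1), device=device, dtype=torch.int32)
    input_lengths = torch.full((B,), T, device=device, dtype=torch.int32)
    label_lengths = torch.full((B,), U - 1, device=device, dtype=torch.int32)
    costs = torch.zeros(B, device=device, dtype=acts.dtype)
    grads = torch.zeros_like(acts)

    rnnt_loss_gpu(acts, labels, input_lengths, label_lengths, costs, grads,
                  blank_label=0, fastemit_lambda=0.0, clamp=0.0, num_threads=-1)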
Example #6
    def test_compute_alphas_kernel(self):
        numba_utils.skip_numba_cuda_test_if_unsupported(
            __NUMBA_MINIMUM_VERSION__)

        random = np.random.RandomState(0)
        original_shape = [1, 5, 11, 3]
        B, T, U, V = original_shape

        # Numpy kernel
        x = random.randn(*original_shape)
        labels = np.array([[1, 1, 1, 2, 2, 2, 1, 2, 2, 1]])  # [1, 10]
        label_len = len(labels[0]) + 1
        blank_idx = 0

        x_np = log_softmax(x, axis=-1)
        ground_alphas, ground_log_likelihood = rnnt_numpy.forward_pass(
            x_np[0, :, :label_len, :], labels[0, :label_len - 1], blank_idx)

        # Pytorch kernel
        device = torch.device('cuda')
        if hasattr(cuda, 'external_stream'):
            stream = cuda.external_stream(
                torch.cuda.current_stream(device).cuda_stream)
        else:
            stream = cuda.default_stream()

        x_c = torch.tensor(x, device=device, dtype=torch.float32)
        labels_c = torch.tensor(labels, device=device, dtype=torch.int32)

        # Allocate workspace memory
        denom = torch.zeros(B * T * U, device=device, dtype=x_c.dtype)
        alphas = torch.zeros(B * T * U, device=device, dtype=x_c.dtype)
        llForward = torch.zeros(B, device=device, dtype=x_c.dtype)
        input_lengths = torch.tensor([T], dtype=torch.int32, device=device)
        label_lengths = torch.tensor([len(labels[0])],
                                     dtype=torch.int32,
                                     device=device)

        # certify input data
        certify_inputs(x_c, labels_c, input_lengths, label_lengths)

        # flatten activation tensor (for pointer based indexing)
        x_c = x_c.view([-1])

        # call kernel
        # log softmax reduction
        reduce.reduce_max(x_c,
                          denom,
                          rows=V,
                          cols=B * T * U,
                          minus=False,
                          stream=stream)
        reduce.reduce_exp(x_c,
                          denom,
                          rows=V,
                          cols=B * T * U,
                          minus=True,
                          stream=stream)

        # alpha kernel
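        # Launch configuration is [griddim, blockdim, stream, dynamic shared memory in
        # bytes]: one block per batch element, U threads per block, no shared memory.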
        gpu_rnnt_kernel.compute_alphas_kernel[B, U, stream, 0](
            x_c,
            denom,
            alphas,
            llForward,
            input_lengths,
            label_lengths,
            labels_c,
            B,
            T,
            U,
            V,
            blank_idx,
        )

        # sync kernel
        stream.synchronize()

        # reshape alphas
        alphas = alphas.view([B, T, U])
        diff = ground_alphas - alphas[0].cpu().numpy()

        assert np.abs(diff).mean() <= 1e-5
        assert np.square(diff).mean() <= 1e-10

        ll_diff = ground_log_likelihood - llForward[0].cpu().numpy()

        assert np.abs(ll_diff).mean() <= 1e-5
        assert np.square(ll_diff).mean() <= 1e-10
Example #7
from mpi4py import MPI
from numba import cuda
import numpy


@cuda.jit()
def add_const(arr, value):
    x = cuda.grid(1)
    if x < arr.size:
        arr[x] += value


comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

# Send-Recv
if rank == 0:
    buf = cuda.device_array((20,), dtype='f')
    buf[:] = range(20)
    block = 32
    grid = (buf.size + block - 1)//block
    add_const[grid, block](buf, 100)
    # always make sure the GPU buffer is ready before any MPI operation
    cuda.default_stream().synchronize()
    comm.Send(buf, dest=1, tag=77)
else:
    buf = cuda.device_array((20,), dtype='f')
    cuda.default_stream().synchronize()
    comm.Recv(buf, source=0, tag=77)
    buf = buf.copy_to_host()
    assert numpy.allclose(buf, 100+numpy.arange(20, dtype='f'))
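
# Note: passing device arrays directly to comm.Send/comm.Recv as above requires a
# CUDA-aware MPI build. The script is typically launched with two ranks, e.g.
# "mpiexec -n 2 python <script>.py"; the exact launcher invocation depends on the
# MPI installation.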
Example #8
    def test_compute_grads_kernel_clamp(self):
        numba_utils.skip_numba_cuda_test_if_unsupported(
            __NUMBA_MINIMUM_VERSION__)

        fastemit_lambda = 0.0
        clamp = 0.1

        random = np.random.RandomState(0)
        original_shape = [1, 5, 11, 3]
        B, T, U, V = original_shape

        # Numpy kernel
        x = random.randn(*original_shape)
        labels = torch.from_numpy(
            np.array([[1, 1, 1, 2, 2, 2, 1, 2, 2, 1]],
                     dtype=np.int32))  # [1, 10]
        audio_len = torch.from_numpy(np.array([T], dtype=np.int32))
        label_len = torch.from_numpy(np.array([U - 1], dtype=np.int32))
        blank_idx = 0

        x_np = torch.from_numpy(x)
        x_np.requires_grad_(True)
        """
        Here we will directly utilize the numpy variant of the loss without explicitly calling
        the numpy functions for alpha, beta and grads. 

        This is because the grads returned by the rnnt_numpy.transduce_batch() are :
        d/dx (alpha + beta alignment)(log_softmax(x)). 
        But according to the chain rule, we'd still need to compute the gradient of log_softmax(x)
        and update the alignments by hand. Instead, we will rely on pytorch to compute the gradient 
        of the log_softmax(x) step and propagate it backwards. 
        """
        loss_func = rnnt_numpy.RNNTLoss(blank_idx,
                                        fastemit_lambda=fastemit_lambda,
                                        clamp=clamp)
        loss_val = loss_func(x_np, labels, audio_len, label_len)
        loss_val.sum().backward()
        true_grads = x_np.grad

        # Pytorch kernel
        device = torch.device('cuda')
        if hasattr(cuda, 'external_stream'):
            stream = cuda.external_stream(
                torch.cuda.current_stream(device).cuda_stream)
        else:
            stream = cuda.default_stream()

        x_c = torch.tensor(x, device=device, dtype=torch.float32)
        labels_c = torch.tensor(labels, device=device, dtype=torch.int32)

        # Allocate workspace memory
        denom = torch.zeros(B * T * U, device=device, dtype=x_c.dtype)
        alphas = torch.zeros(B * T * U, device=device, dtype=x_c.dtype)
        betas = torch.zeros(B * T * U, device=device, dtype=x_c.dtype)
        llForward = torch.zeros(B, device=device, dtype=x_c.dtype)
        llBackward = torch.zeros(B, device=device, dtype=x_c.dtype)
        input_lengths = torch.tensor([T], dtype=torch.int32, device=device)
        label_lengths = torch.tensor([len(labels[0])],
                                     dtype=torch.int32,
                                     device=device)

        # certify input data
        certify_inputs(x_c, labels_c, input_lengths, label_lengths)

        # flatten activation tensor (for pointer based indexing)
        x_c = x_c.view([-1])
        grads = torch.zeros_like(x_c, requires_grad=False)

        # call kernel
        # log softmax reduction
        reduce.reduce_max(x_c,
                          denom,
                          rows=V,
                          cols=B * T * U,
                          minus=False,
                          stream=stream)
        reduce.reduce_exp(x_c,
                          denom,
                          rows=V,
                          cols=B * T * U,
                          minus=True,
                          stream=stream)

        # alpha kernel
        gpu_rnnt_kernel.compute_alphas_kernel[B, U, stream, 0](
            x_c,
            denom,
            alphas,
            llForward,
            input_lengths,
            label_lengths,
            labels_c,
            B,
            T,
            U,
            V,
            blank_idx,
        )

        # beta kernel
        gpu_rnnt_kernel.compute_betas_kernel[B, U, stream, 0](
            x_c,
            denom,
            betas,
            llBackward,
            input_lengths,
            label_lengths,
            labels_c,
            B,
            T,
            U,
            V,
            blank_idx,
        )

        # gamma kernel
        grad_blocks_per_grid = B * T * U
        grad_threads_per_block = gpu_rnnt_kernel.GPU_RNNT_THREAD_SIZE
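        # One block per (b, t, u) cell of the lattice; the threads in each block
        # cooperate over the vocabulary dimension, mirroring the warp-transducer GPU
        # kernel layout this implementation is ported from.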
        gpu_rnnt_kernel.compute_grad_kernel[grad_blocks_per_grid,
                                            grad_threads_per_block, stream, 0](
                                                grads,
                                                x_c,
                                                denom,
                                                alphas,
                                                betas,
                                                llForward,
                                                input_lengths,
                                                label_lengths,
                                                labels_c,
                                                B,
                                                T,
                                                U,
                                                V,
                                                blank_idx,
                                                fastemit_lambda,
                                                clamp,
                                            )

        # sync kernel
        stream.synchronize()

        # reshape grads
        grads = grads.view([B, T, U, V])
        diff = true_grads - grads[0].cpu().numpy()

        assert np.abs(diff).mean() <= 1e-5
        assert np.square(diff).mean() <= 1e-10