def test_no_sync_supplied_stream(self): # There should not be a synchronization when a stream is supplied for # the setitem call, whether it is the default stream, the legacy default # stream, the per-thread default stream, or another stream. streams = (cuda.stream(), cuda.default_stream(), cuda.legacy_default_stream(), cuda.per_thread_default_stream()) for stream in streams: darr = cuda.to_device(np.arange(5)) with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', return_value=None) as mock_sync: darr.setitem(0, 10, stream=stream) mock_sync.assert_not_called()
def test_no_sync_default_stream(self): # There should not be a synchronization when the array has a default # stream, whether it is the default stream, the legacy default stream, # the per-thread default stream, or another stream. streams = (cuda.stream(), cuda.default_stream(), cuda.legacy_default_stream(), cuda.per_thread_default_stream()) for stream in streams: darr = cuda.to_device(np.arange(5), stream=stream) with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', return_value=None) as mock_sync: darr[0] = 10 mock_sync.assert_not_called()
def child_test(): from numba import cuda, int32, void from numba.core import config import io import numpy as np import threading # Enable PTDS before we make any CUDA driver calls. Enabling it first # ensures that PTDS APIs are used because the CUDA driver looks up API # functions on first use and memoizes them. config.CUDA_PER_THREAD_DEFAULT_STREAM = 1 # Set up log capture for the Driver API so we can see what API calls were # used. logbuf = io.StringIO() handler = logging.StreamHandler(logbuf) cudadrv_logger = logging.getLogger('numba.cuda.cudadrv.driver') cudadrv_logger.addHandler(handler) cudadrv_logger.setLevel(logging.DEBUG) # Set up data for our test, and copy over to the device N = 2 ** 16 N_THREADS = 10 N_ADDITIONS = 4096 # Seed the RNG for repeatability np.random.seed(1) x = np.random.randint(low=0, high=1000, size=N, dtype=np.int32) r = np.zeros_like(x) # One input and output array for each thread xs = [cuda.to_device(x) for _ in range(N_THREADS)] rs = [cuda.to_device(r) for _ in range(N_THREADS)] # Compute the grid size and get the [per-thread] default stream n_threads = 256 n_blocks = N // n_threads stream = cuda.default_stream() # A simple multiplication-by-addition kernel. What it does exactly is not # too important; only that we have a kernel that does something. @cuda.jit(void(int32[::1], int32[::1])) def f(r, x): i = cuda.grid(1) if i > len(r): return # Accumulate x into r for j in range(N_ADDITIONS): r[i] += x[i] # This function will be used to launch the kernel from each thread on its # own unique data. def kernel_thread(n): f[n_blocks, n_threads, stream](rs[n], xs[n]) # Create threads threads = [threading.Thread(target=kernel_thread, args=(i,)) for i in range(N_THREADS)] # Start all threads for thread in threads: thread.start() # Wait for all threads to finish, to ensure that we don't synchronize with # the device until all kernels are scheduled. for thread in threads: thread.join() # Synchronize with the device cuda.synchronize() # Check output is as expected expected = x * N_ADDITIONS for i in range(N_THREADS): np.testing.assert_equal(rs[i].copy_to_host(), expected) # Return the driver log output to the calling process for checking handler.flush() return logbuf.getvalue()
assert hb == hb2 # out-of-band if pickle.HIGHEST_PROTOCOL >= 5: db = rmm.DeviceBuffer.to_device(hb) buffers = [] pb2 = pickle.dumps(db, protocol=5, buffer_callback=buffers.append) del db assert len(buffers) == 1 assert isinstance(buffers[0], pickle.PickleBuffer) assert bytes(buffers[0]) == hb db3 = pickle.loads(pb2, buffers=buffers) hb3 = db3.tobytes() assert hb3 == hb @pytest.mark.parametrize("stream", [cuda.default_stream(), cuda.stream()]) def test_rmm_pool_numba_stream(stream): rmm.reinitialize(pool_allocator=True) stream = rmm._cuda.stream.Stream(stream) a = rmm._lib.device_buffer.DeviceBuffer(size=3, stream=stream) assert a.size == 3 assert a.ptr != 0 def test_rmm_cupy_allocator(): cupy = pytest.importorskip("cupy") m = rmm.rmm_cupy_allocator(42) assert m.mem.size == 42
def rnnt_loss_gpu( acts: torch.Tensor, labels: torch.Tensor, input_lengths: torch.Tensor, label_lengths: torch.Tensor, costs: torch.Tensor, grads: torch.Tensor, blank_label: int, fastemit_lambda: float, clamp: float, num_threads: int, ): """ Wrapper method for accessing GPU RNNT loss. CUDA implementation ported from [HawkAaron/warp-transducer](https://github.com/HawkAaron/warp-transducer). Args: acts: Activation tensor of shape [B, T, U, V+1]. labels: Ground truth labels of shape [B, U]. input_lengths: Lengths of the acoustic sequence as a vector of ints [B]. label_lengths: Lengths of the target sequence as a vector of ints [B]. costs: Zero vector of length [B] in which costs will be set. grads: Zero tensor of shape [B, T, U, V+1] where the gradient will be set. blank_label: Index of the blank token in the vocabulary. fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. clamp: Float value. When set to value >= 0.0, will clamp the gradient to [-clamp, clamp]. num_threads: Number of threads for OpenMP. """ minibatch_size = acts.shape[0] maxT = acts.shape[1] maxU = acts.shape[2] alphabet_size = acts.shape[3] if hasattr(cuda, 'external_stream'): stream = cuda.external_stream( torch.cuda.current_stream(acts.device).cuda_stream) else: stream = cuda.default_stream() if num_threads < 0: num_threads = multiprocessing.cpu_count() num_threads = max(1, num_threads) # have to use at least 1 thread gpu_size, status = rnnt_helper.get_workspace_size(maxT, maxU, minibatch_size, gpu=True) if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS: raise RuntimeError( "Invalid parameter passed when calculating working space memory") # Select GPU index cuda.select_device(acts.device.index) gpu_workspace = torch.zeros(gpu_size, device=acts.device, dtype=acts.dtype, requires_grad=False) ### VIEW TENSORS AS VECTORS FOR POINTER INDEXING ### acts, acts_shape = rnnt_helper.flatten_tensor(acts) wrapper = gpu_rnnt.GPURNNT( minibatch=minibatch_size, maxT=maxT, maxU=maxU, alphabet_size=alphabet_size, workspace=gpu_workspace, blank=blank_label, fastemit_lambda=fastemit_lambda, clamp=clamp, num_threads=num_threads, stream=stream, ) if grads is None: status = wrapper.score_forward( acts=acts.data, costs=costs.data, pad_labels=labels.data, label_lengths=label_lengths.data, input_lengths=input_lengths.data, ) if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS: raise RuntimeError("Could not calculate forward scores") else: ### FLATTEN GRAD TENSOR ### grads, grads_shape = rnnt_helper.flatten_tensor(grads) status = wrapper.cost_and_grad( acts=acts.data, grads=grads.data, costs=costs.data, pad_labels=labels.data, label_lengths=label_lengths.data, input_lengths=input_lengths.data, ) if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS: raise RuntimeError("Could not calculate forward scores") del gpu_workspace, wrapper return True
def test_compute_alphas_kernel(self): numba_utils.skip_numba_cuda_test_if_unsupported( __NUMBA_MINIMUM_VERSION__) random = np.random.RandomState(0) original_shape = [1, 5, 11, 3] B, T, U, V = original_shape # Numpy kernel x = random.randn(*original_shape) labels = np.array([[1, 1, 1, 2, 2, 2, 1, 2, 2, 1]]) # [1, 10] label_len = len(labels[0]) + 1 blank_idx = 0 x_np = log_softmax(x, axis=-1) ground_alphas, ground_log_likelihood = rnnt_numpy.forward_pass( x_np[0, :, :label_len, :], labels[0, :label_len - 1], blank_idx) # Pytorch kernel device = torch.device('cuda') if hasattr(cuda, 'external_stream'): stream = cuda.external_stream( torch.cuda.current_stream(device).cuda_stream) else: stream = cuda.default_stream() x_c = torch.tensor(x, device=device, dtype=torch.float32) labels_c = torch.tensor(labels, device=device, dtype=torch.int32) # Allocate workspace memory denom = torch.zeros(B * T * U, device=device, dtype=x_c.dtype) alphas = torch.zeros(B * T * U, device=device, dtype=x_c.dtype) llForward = torch.zeros(B, device=device, dtype=x_c.dtype) input_lengths = torch.tensor([T], dtype=torch.int32, device=device) label_lengths = torch.tensor([len(labels[0])], dtype=torch.int32, device=device) # certify input data certify_inputs(x_c, labels_c, input_lengths, label_lengths) # flatten activation tensor (for pointer based indexing) x_c = x_c.view([-1]) # call kernel # log softmax reduction reduce.reduce_max(x_c, denom, rows=V, cols=B * T * U, minus=False, stream=stream) reduce.reduce_exp(x_c, denom, rows=V, cols=B * T * U, minus=True, stream=stream) # alpha kernel gpu_rnnt_kernel.compute_alphas_kernel[B, U, stream, 0]( x_c, denom, alphas, llForward, input_lengths, label_lengths, labels_c, B, T, U, V, blank_idx, ) # sync kernel stream.synchronize() # reshape alphas alphas = alphas.view([B, T, U]) diff = ground_alphas - alphas[0].cpu().numpy() assert np.abs(diff).mean() <= 1e-5 assert np.square(diff).mean() <= 1e-10 ll_diff = ground_log_likelihood - llForward[0].cpu().numpy() assert np.abs(ll_diff).mean() <= 1e-5 assert np.square(ll_diff).mean() <= 1e-10
import numpy @cuda.jit() def add_const(arr, value): x = cuda.grid(1) if x < arr.size: arr[x] += value comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() # Send-Recv if rank == 0: buf = cuda.device_array((20,), dtype='f') buf[:] = range(20) block = 32 grid = (buf.size + block - 1)//block add_const[grid, block](buf, 100) # always make sure the GPU buffer is ready before any MPI operation cuda.default_stream().synchronize() comm.Send(buf, dest=1, tag=77) else: buf = cuda.device_array((20,), dtype='f') cuda.default_stream().synchronize() comm.Recv(buf, source=0, tag=77) buf = buf.copy_to_host() assert numpy.allclose(buf, 100+numpy.arange(20, dtype='f'))
def test_compute_grads_kernel_clamp(self): numba_utils.skip_numba_cuda_test_if_unsupported( __NUMBA_MINIMUM_VERSION__) fastemit_lambda = 0.0 clamp = 0.1 random = np.random.RandomState(0) original_shape = [1, 5, 11, 3] B, T, U, V = original_shape # Numpy kernel x = random.randn(*original_shape) labels = torch.from_numpy( np.array([[1, 1, 1, 2, 2, 2, 1, 2, 2, 1]], dtype=np.int32)) # [1, 10] audio_len = torch.from_numpy(np.array([T], dtype=np.int32)) label_len = torch.from_numpy(np.array([U - 1], dtype=np.int32)) blank_idx = 0 x_np = torch.from_numpy(x) x_np.requires_grad_(True) """ Here we will directly utilize the numpy variant of the loss without explicitly calling the numpy functions for alpha, beta and grads. This is because the grads returned by the rnnt_numpy.transduce_batch() are : d/dx (alpha + beta alignment)(log_softmax(x)). But according to the chain rule, we'd still need to compute the gradient of log_softmax(x) and update the alignments by hand. Instead, we will rely on pytorch to compute the gradient of the log_softmax(x) step and propagate it backwards. """ loss_func = rnnt_numpy.RNNTLoss(blank_idx, fastemit_lambda=fastemit_lambda, clamp=clamp) loss_val = loss_func(x_np, labels, audio_len, label_len) loss_val.sum().backward() true_grads = x_np.grad # Pytorch kernel device = torch.device('cuda') if hasattr(cuda, 'external_stream'): stream = cuda.external_stream( torch.cuda.current_stream(device).cuda_stream) else: stream = cuda.default_stream() x_c = torch.tensor(x, device=device, dtype=torch.float32) labels_c = torch.tensor(labels, device=device, dtype=torch.int32) # Allocate workspace memory denom = torch.zeros(B * T * U, device=device, dtype=x_c.dtype) alphas = torch.zeros(B * T * U, device=device, dtype=x_c.dtype) betas = torch.zeros(B * T * U, device=device, dtype=x_c.dtype) llForward = torch.zeros(B, device=device, dtype=x_c.dtype) llBackward = torch.zeros(B, device=device, dtype=x_c.dtype) input_lengths = torch.tensor([T], dtype=torch.int32, device=device) label_lengths = torch.tensor([len(labels[0])], dtype=torch.int32, device=device) # certify input data certify_inputs(x_c, labels_c, input_lengths, label_lengths) # flatten activation tensor (for pointer based indexing) x_c = x_c.view([-1]) grads = torch.zeros_like(x_c, requires_grad=False) # call kernel # log softmax reduction reduce.reduce_max(x_c, denom, rows=V, cols=B * T * U, minus=False, stream=stream) reduce.reduce_exp(x_c, denom, rows=V, cols=B * T * U, minus=True, stream=stream) # alpha kernel gpu_rnnt_kernel.compute_alphas_kernel[B, U, stream, 0]( x_c, denom, alphas, llForward, input_lengths, label_lengths, labels_c, B, T, U, V, blank_idx, ) # beta kernel gpu_rnnt_kernel.compute_betas_kernel[B, U, stream, 0]( x_c, denom, betas, llBackward, input_lengths, label_lengths, labels_c, B, T, U, V, blank_idx, ) # gamma kernel grad_blocks_per_grid = B * T * U grad_threads_per_block = gpu_rnnt_kernel.GPU_RNNT_THREAD_SIZE gpu_rnnt_kernel.compute_grad_kernel[grad_blocks_per_grid, grad_threads_per_block, stream, 0]( grads, x_c, denom, alphas, betas, llForward, input_lengths, label_lengths, labels_c, B, T, U, V, blank_idx, fastemit_lambda, clamp, ) # sync kernel stream.synchronize() # reshape grads grads = grads.view([B, T, U, V]) diff = true_grads - grads[0].cpu().numpy() assert np.abs(diff).mean() <= 1e-5 assert np.square(diff).mean() <= 1e-10