def test_pytorch_cpu_tensor_to_cu_subvector(self):
    """Construct a kaldi CuSubVector sharing memory with a CPU tensor.

    Only meaningful when Kaldi is compiled WITHOUT CUDA support
    (a CuSubVector then lives in host memory), so the test is
    skipped on a CUDA build of Kaldi.
    """
    if kaldi.CudaCompiled():
        print('This test is for constructing a CuSubVector from '
              'a CPU tensor')
        print('Kaldi is compiled with GPU, skip it')
        return

    tensor = torch.tensor([10, 20]).float()
    v = kaldi.CuSubVectorFromDLPack(to_dlpack(tensor))

    # memory is shared, so mutating `v` also changes `tensor`
    v.SetZero()
    self.assertEqual(tensor[0], 0)
    self.assertEqual(tensor[1], 0)

    v.Add(8)
    self.assertEqual(tensor[0], 8)

    del v

    # memory is shared between `v` and `tensor`
    v = kaldi.DLPackFloatCuSubVector.from_dlpack(to_dlpack(tensor))
    v.Add(100)
    self.assertEqual(tensor[0], 108)
def test_pytorch_and_kaldi_gpu_tensor_zero_copy(self):
    """Share GPU memory between PyTorch tensors and Kaldi CUDA
    vectors/matrices via DLPack, in both directions, with no copy.

    Bug fix: several checks were written as ``assertTrue(x, expected)``,
    which treats ``expected`` as the failure *message* and only asserts
    that ``x`` is truthy — they are now real ``assertEqual`` checks.
    """
    # (fangjun): we put all tests in this function to avoid
    # invoking SelectGpuDevice() twice
    if not torch.cuda.is_available():
        print('No GPU detected! Skip it')
        return

    if not kaldi.CudaCompiled():
        print('Kaldi is not compiled with CUDA! Skip it')
        return

    device_id = 0

    # Kaldi and PyTorch will use the same GPU
    kaldi.SelectGpuDevice(device_id=device_id)

    device = torch.device('cuda', device_id)

    tensor = torch.arange(3).float()
    tensor = tensor.to(device)

    # make sure the tensor from PyTorch is indeed on GPU
    self.assertTrue(tensor.is_cuda)

    # GPU data is shared between kaldi::CuSubVector and the PyTorch
    # GPU tensor; no data is copied
    v = kaldi.CuSubVectorFromDLPack(to_dlpack(tensor))
    self.assertIsInstance(v, kaldi.FloatCuSubVector)

    v.Add(value=10)
    self.assertEqual(tensor[0], 10)
    self.assertEqual(tensor[1], 11)
    self.assertEqual(tensor[2], 12)

    v.Scale(value=6)
    self.assertEqual(tensor[0], 60)
    self.assertEqual(tensor[1], 66)
    self.assertEqual(tensor[2], 72)

    v.SetZero()
    self.assertEqual(tensor[0], 0)
    self.assertEqual(tensor[1], 0)
    self.assertEqual(tensor[2], 0)

    # Now for CuSubMatrix
    tensor = torch.arange(3).reshape(1, 3).float()
    tensor = tensor.to(device)

    # make sure the tensor from PyTorch is indeed on GPU
    self.assertTrue(tensor.is_cuda)

    m = kaldi.CuSubMatrixFromDLPack(to_dlpack(tensor))
    m.ApplyExp()

    self.assertAlmostEqual(tensor[0, 0], math.exp(0), places=7)
    self.assertAlmostEqual(tensor[0, 1], math.exp(1), places=7)
    self.assertAlmostEqual(tensor[0, 2], math.exp(2), places=7)

    m.SetZero()
    self.assertEqual(tensor[0, 0], 0)
    self.assertEqual(tensor[0, 1], 0)
    self.assertEqual(tensor[0, 2], 0)

    # now from Kaldi to PyTorch
    dim = 2
    cpu_v = kaldi.FloatVector(size=dim)
    cpu_v[0] = 10
    cpu_v[1] = 20

    gpu_v = kaldi.FloatCuVector(cpu_v)
    self.assertEqual(gpu_v[0], 10)
    self.assertEqual(gpu_v[1], 20)

    gpu_v_reference_count = sys.getrefcount(gpu_v)

    # memory is shared between `gpu_v` and `tensor`
    tensor = from_dlpack(gpu_v.to_dlpack())

    # `gpu_v.to_dlpack()` increases the reference count of `gpu_v`
    self.assertEqual(gpu_v_reference_count + 1, sys.getrefcount(gpu_v))

    self.assertTrue(tensor.is_cuda)
    self.assertEqual(tensor.device.index, device_id)

    # was: assertTrue(tensor[0], 10) — a no-op comparison
    self.assertEqual(tensor[0], 10)
    self.assertEqual(tensor[1], 20)

    tensor[0] = 1  # also changes `gpu_v`
    tensor[1] = 2
    self.assertEqual(gpu_v[0], 1)
    self.assertEqual(gpu_v[1], 2)

    gpu_v.Add(10)  # also changes `tensor`
    self.assertEqual(tensor[0], 11)
    self.assertEqual(tensor[1], 12)

    del tensor
    gc.collect()

    # now the reference count for gpu_v is decreased by one
    self.assertEqual(gpu_v_reference_count, sys.getrefcount(gpu_v))

    self.assertEqual(gpu_v[0], 11)  # gpu_v is still alive
    self.assertEqual(gpu_v[1], 12)

    # now for CuMatrix
    num_rows = 1
    num_cols = 2
    cpu_m = kaldi.FloatMatrix(row=num_rows, col=num_cols)
    cpu_m[0, 0] = 1
    cpu_m[0, 1] = 2

    gpu_m = kaldi.FloatCuMatrix(cpu_m)
    self.assertEqual(gpu_m[0, 0], 1)
    self.assertEqual(gpu_m[0, 1], 2)

    gpu_m_reference_count = sys.getrefcount(gpu_m)

    # memory is shared between `gpu_m` and `tensor`
    tensor = from_dlpack(gpu_m.to_dlpack())
    self.assertEqual(gpu_m_reference_count + 1, sys.getrefcount(gpu_m))

    self.assertTrue(tensor.is_cuda)
    self.assertEqual(tensor.device.index, device_id)

    # was: assertTrue(tensor[0, 0], 1) — a no-op comparison
    self.assertEqual(tensor[0, 0], 1)
    self.assertEqual(tensor[0, 1], 2)

    tensor[0, 0] = 6  # also changes `gpu_m`
    tensor[0, 1] = 8
    self.assertEqual(gpu_m[0, 0], 6)
    self.assertEqual(gpu_m[0, 1], 8)

    gpu_m.Add(2)  # also changes `tensor`
    # was: assertTrue(tensor[0, 0], 8) — a no-op comparison
    self.assertEqual(tensor[0, 0], 8)
    self.assertEqual(tensor[0, 1], 10)

    del tensor
    gc.collect()
    self.assertEqual(gpu_m_reference_count, sys.getrefcount(gpu_m))

    self.assertEqual(gpu_m[0, 0], 8)  # `gpu_m` is still alive
    self.assertEqual(gpu_m[0, 1], 10)

    # now for CuVector from_dlpack
    tensor = torch.tensor([1, 2]).float()
    tensor = tensor.to(device)

    # memory is shared between `tensor` and `v`
    v = kaldi.DLPackFloatCuSubVector.from_dlpack(to_dlpack(tensor))
    self.assertEqual(v[0], 1)

    v.Add(1)  # also changes `tensor`
    self.assertEqual(tensor[0], 2)
    self.assertEqual(tensor[1], 3)

    del v
    del tensor

    # now for CuMatrix from_dlpack
    tensor = torch.tensor([1, 2]).reshape(1, 2).float()
    tensor = tensor.to(device)

    # memory is shared between `tensor` and `m`
    m = kaldi.DLPackFloatCuSubMatrix.from_dlpack(to_dlpack(tensor))
    self.assertEqual(m[0, 0], 1)

    m.Add(100)  # also changes `tensor`
    self.assertEqual(tensor[0, 0], 101)

    del m
    del tensor
    gc.collect()

    # now test the issue: https://github.com/pytorch/pytorch/issues/9261
    # it will not consume all GPU memory
    for _ in range(100):
        b = torch.randn(1024 * 1024 * 1024 // 4, 1, device=device)  # 1G
        a = kaldi.CuSubMatrixFromDLPack(to_dlpack(b))  # keep alive across gc
        gc.collect()

    torch.cuda.empty_cache()

    for _ in range(100 * 4):
        b = kaldi.FloatCuMatrix(1024 * 1024, 64)  # 256 MB
        a = from_dlpack(b.to_dlpack())  # keep alive across gc
        gc.collect()
def test_case4(self):
    """Batched warp-ctc GPU loss: cases 1-3 combined into one minibatch.

    Examples (label sequence, input_length, label_length):
      1. (a):    1, 1
      2. (c, c): 3, 2
      3. (b, c): 3, 2
    """
    device = torch.device('cuda', device_id)

    label_lengths_tensor = torch.tensor([1, 2, 2], dtype=torch.int32)
    input_lengths_tensor = torch.tensor([1, 3, 3], dtype=torch.int32)

    alphabet_size = 5
    minibatch = 3

    info = ctc.CtcOptions()
    info.loc = ctc.CtcComputeLocation.CTC_GPU
    info.blank_label = 0

    label_lengths = kaldi.IntSubVectorFromDLPack(
        to_dlpack(label_lengths_tensor))
    input_lengths = kaldi.IntSubVectorFromDLPack(
        to_dlpack(input_lengths_tensor))

    status, size_in_bytes = ctc.GetWorkspaceSize(
        label_lengths=label_lengths,
        input_lengths=input_lengths,
        alphabet_size=alphabet_size,
        minibatch=minibatch,
        info=info)
    self.assertEqual(status, ctc.CtcStatus.CTC_STATUS_SUCCESS)

    # round up to whole floats for the GPU scratch buffer
    num_floats = size_in_bytes // 4 + 1
    workspace_tensor = torch.empty(
        num_floats, dtype=torch.float32).contiguous().to(device)

    ex1 = torch.tensor([[0.2, 0.2, 0.2, 0.2, 0.2]], dtype=torch.float32)
    ex2 = torch.tensor(
        [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]],
        dtype=torch.float32)
    ex3 = torch.tensor([[-5, -4, -3, -2, -1], [-10, -9, -8, -7, -6],
                        [-15, -14, -13, -12, -11]],
                       dtype=torch.float32)

    # pad to a (time, batch, alphabet) layout, then flatten for kaldi
    activations_tensor = pad_sequence([ex1, ex2, ex3], batch_first=False)
    activations_tensor = activations_tensor.contiguous().view(-1).to(device)

    gradients_tensor = torch.empty_like(activations_tensor)

    # labels are: (a), (c, c) (b, c)
    # which are:  (1), (3, 3), (2, 3)
    flat_labels_tensor = torch.tensor([1, 3, 3, 2, 3], dtype=torch.int32)
    costs_tensor = torch.empty(minibatch, dtype=torch.float32)

    activations = kaldi.CuSubVectorFromDLPack(to_dlpack(activations_tensor))
    gradients = kaldi.CuSubVectorFromDLPack(to_dlpack(gradients_tensor))
    flat_labels = kaldi.IntSubVectorFromDLPack(
        to_dlpack(flat_labels_tensor))
    costs = kaldi.FloatSubVectorFromDLPack(to_dlpack(costs_tensor))
    workspace = kaldi.CuSubVectorFromDLPack(to_dlpack(workspace_tensor))

    status = ctc.ComputeCtcLossGpu(activations=activations,
                                   gradients=gradients,
                                   flat_labels=flat_labels,
                                   label_lengths=label_lengths,
                                   input_lengths=input_lengths,
                                   alphabet_size=alphabet_size,
                                   minibatch=minibatch,
                                   costs=costs,
                                   workspace=workspace,
                                   options=info)

    # expected losses match the single-example cases 1-3
    self.assertAlmostEqual(costs[0], 1.6094379425049)
    self.assertAlmostEqual(costs[1], 7.355742931366)
    self.assertAlmostEqual(costs[2], 4.938850402832, places=6)
def test_case1(self):
    """Simplest warp-ctc GPU case: one frame, uniform probabilities.

    See https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md
    One sequence with activations [0.2, 0.2, 0.2, 0.2, 0.2].
    """
    device = torch.device('cuda', device_id)

    label_lengths_tensor = torch.tensor([1], dtype=torch.int32)
    input_lengths_tensor = torch.tensor([1], dtype=torch.int32)

    alphabet_size = 5
    minibatch = 1

    info = ctc.CtcOptions()
    info.loc = ctc.CtcComputeLocation.CTC_GPU
    info.blank_label = 0

    label_lengths = kaldi.IntSubVectorFromDLPack(
        to_dlpack(label_lengths_tensor))
    input_lengths = kaldi.IntSubVectorFromDLPack(
        to_dlpack(input_lengths_tensor))

    status, size_in_bytes = ctc.GetWorkspaceSize(
        label_lengths=label_lengths,
        input_lengths=input_lengths,
        alphabet_size=alphabet_size,
        minibatch=minibatch,
        info=info)
    self.assertEqual(status, ctc.CtcStatus.CTC_STATUS_SUCCESS)

    # round up to whole floats for the GPU scratch buffer
    num_floats = size_in_bytes // 4 + 1
    workspace_tensor = torch.empty(
        num_floats, dtype=torch.float32).contiguous().to(device)

    activations_tensor = torch.tensor(
        [0.2, 0.2, 0.2, 0.2, 0.2],
        dtype=torch.float32).contiguous().to(device)
    gradients_tensor = torch.empty_like(activations_tensor)
    flat_labels_tensor = torch.tensor([1], dtype=torch.int32)
    costs_tensor = torch.empty(minibatch, dtype=torch.float32)

    activations = kaldi.CuSubVectorFromDLPack(to_dlpack(activations_tensor))
    gradients = kaldi.CuSubVectorFromDLPack(to_dlpack(gradients_tensor))
    flat_labels = kaldi.IntSubVectorFromDLPack(
        to_dlpack(flat_labels_tensor))
    costs = kaldi.FloatSubVectorFromDLPack(to_dlpack(costs_tensor))
    workspace = kaldi.CuSubVectorFromDLPack(to_dlpack(workspace_tensor))

    # run the loss on PyTorch's default stream for this device
    stream = torch.cuda.default_stream(device)
    with torch.cuda.stream(stream):
        status = ctc.ComputeCtcLossGpu(activations=activations,
                                       gradients=gradients,
                                       flat_labels=flat_labels,
                                       label_lengths=label_lengths,
                                       input_lengths=input_lengths,
                                       alphabet_size=alphabet_size,
                                       minibatch=minibatch,
                                       costs=costs,
                                       workspace=workspace,
                                       options=info)

    # 1.6094379425049 is copied from
    # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md
    self.assertAlmostEqual(costs[0], 1.6094379425049)
def test_case3(self):
    """Third warp-ctc GPU case: one sequence over 3 time steps.

    Activations (one row per time step):
        [-5,  -4,  -3,  -2,  -1]
        [-10, -9,  -8,  -7,  -6]
        [-15, -14, -13, -12, -11]
    """
    device = torch.device('cuda', device_id)

    label_lengths_tensor = torch.tensor([2], dtype=torch.int32)
    input_lengths_tensor = torch.tensor([3], dtype=torch.int32)

    alphabet_size = 5
    minibatch = 1

    info = ctc.CtcOptions()
    info.loc = ctc.CtcComputeLocation.CTC_GPU
    info.blank_label = 0

    label_lengths = kaldi.IntSubVectorFromDLPack(
        to_dlpack(label_lengths_tensor))
    input_lengths = kaldi.IntSubVectorFromDLPack(
        to_dlpack(input_lengths_tensor))

    status, size_in_bytes = ctc.GetWorkspaceSize(
        label_lengths=label_lengths,
        input_lengths=input_lengths,
        alphabet_size=alphabet_size,
        minibatch=minibatch,
        info=info)
    self.assertEqual(status, ctc.CtcStatus.CTC_STATUS_SUCCESS)

    # round up to whole floats for the GPU scratch buffer
    num_floats = size_in_bytes // 4 + 1
    workspace_tensor = torch.empty(
        num_floats, dtype=torch.float32).contiguous().to(device)

    activations_tensor = torch.tensor(
        [[-5, -4, -3, -2, -1], [-10, -9, -8, -7, -6],
         [-15, -14, -13, -12, -11]],
        dtype=torch.float32).contiguous().view(-1).to(device)
    gradients_tensor = torch.empty_like(activations_tensor)

    # the target sequence is (b, c), which is (2, 3)
    flat_labels_tensor = torch.tensor([2, 3], dtype=torch.int32)
    costs_tensor = torch.empty(minibatch, dtype=torch.float32)

    activations = kaldi.CuSubVectorFromDLPack(to_dlpack(activations_tensor))
    gradients = kaldi.CuSubVectorFromDLPack(to_dlpack(gradients_tensor))
    flat_labels = kaldi.IntSubVectorFromDLPack(
        to_dlpack(flat_labels_tensor))
    costs = kaldi.FloatSubVectorFromDLPack(to_dlpack(costs_tensor))
    workspace = kaldi.CuSubVectorFromDLPack(to_dlpack(workspace_tensor))

    status = ctc.ComputeCtcLossGpu(activations=activations,
                                   gradients=gradients,
                                   flat_labels=flat_labels,
                                   label_lengths=label_lengths,
                                   input_lengths=input_lengths,
                                   alphabet_size=alphabet_size,
                                   minibatch=minibatch,
                                   costs=costs,
                                   workspace=workspace,
                                   options=info)

    # 4.938850402832 is copied from
    # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md
    self.assertAlmostEqual(costs[0], 4.938850402832, places=6)